{ "training_args": { "output_dir": "/sc/projects/sci-herbrich/chair/lora-bp/valentin.teutschbein/adapters_2/nlu_rte_ff_v1", "overwrite_output_dir": false, "do_train": false, "do_eval": true, "do_predict": false, "eval_strategy": "steps", "prediction_loss_only": false, "per_device_train_batch_size": 4, "per_device_eval_batch_size": 8, "per_gpu_train_batch_size": null, "per_gpu_eval_batch_size": null, "gradient_accumulation_steps": 4, "eval_accumulation_steps": null, "eval_delay": 0, "torch_empty_cache_steps": null, "learning_rate": 2e-05, "weight_decay": 0.0, "adam_beta1": 0.9, "adam_beta2": 0.999, "adam_epsilon": 1e-08, "max_grad_norm": 1.0, "num_train_epochs": 3, "max_steps": -1, "lr_scheduler_type": "linear", "lr_scheduler_kwargs": {}, "warmup_ratio": 0.0, "warmup_steps": 0, "log_level": "passive", "log_level_replica": "warning", "log_on_each_node": true, "logging_dir": "/sc/projects/sci-herbrich/chair/lora-bp/valentin.teutschbein/adapters_2/nlu_rte_ff_v1/runs/Sep30_22-05-59_gx11", "logging_strategy": "steps", "logging_first_step": false, "logging_steps": 20, "logging_nan_inf_filter": true, "save_strategy": "epoch", "save_steps": 500, "save_total_limit": null, "save_safetensors": true, "save_on_each_node": false, "save_only_model": false, "restore_callback_states_from_checkpoint": false, "no_cuda": false, "use_cpu": false, "use_mps_device": false, "seed": 42, "data_seed": null, "jit_mode_eval": false, "use_ipex": false, "bf16": false, "fp16": false, "fp16_opt_level": "O1", "half_precision_backend": "auto", "bf16_full_eval": false, "fp16_full_eval": false, "tf32": null, "local_rank": 0, "ddp_backend": null, "tpu_num_cores": null, "tpu_metrics_debug": false, "debug": [], "dataloader_drop_last": false, "eval_steps": 39, "dataloader_num_workers": 0, "dataloader_prefetch_factor": null, "past_index": -1, "run_name": "/sc/projects/sci-herbrich/chair/lora-bp/valentin.teutschbein/adapters_2/nlu_rte_ff_v1", "disable_tqdm": false, "remove_unused_columns": true, "label_names": null, "load_best_model_at_end": false, "metric_for_best_model": null, "greater_is_better": null, "ignore_data_skip": false, "fsdp": [], "fsdp_min_num_params": 0, "fsdp_config": { "min_num_params": 0, "xla": false, "xla_fsdp_v2": false, "xla_fsdp_grad_ckpt": false }, "fsdp_transformer_layer_cls_to_wrap": null, "accelerator_config": { "split_batches": false, "dispatch_batches": null, "even_batches": true, "use_seedable_sampler": true, "non_blocking": false, "gradient_accumulation_kwargs": null }, "deepspeed": null, "label_smoothing_factor": 0.0, "optim": "adamw_torch", "optim_args": null, "adafactor": false, "group_by_length": false, "length_column_name": "length", "report_to": [], "ddp_find_unused_parameters": null, "ddp_bucket_cap_mb": null, "ddp_broadcast_buffers": null, "dataloader_pin_memory": true, "dataloader_persistent_workers": false, "skip_memory_metrics": true, "use_legacy_prediction_loop": false, "push_to_hub": false, "resume_from_checkpoint": null, "hub_model_id": null, "hub_strategy": "every_save", "hub_token": "", "hub_private_repo": null, "hub_always_push": false, "gradient_checkpointing": false, "gradient_checkpointing_kwargs": null, "include_inputs_for_metrics": false, "include_for_metrics": [], "eval_do_concat_batches": true, "fp16_backend": "auto", "push_to_hub_model_id": null, "push_to_hub_organization": null, "push_to_hub_token": "", "mp_parameters": "", "auto_find_batch_size": false, "full_determinism": false, "torchdynamo": null, "ray_scope": "last", "ddp_timeout": 1800, "torch_compile": false, "torch_compile_backend": null, "torch_compile_mode": null, "include_tokens_per_second": false, "include_num_input_tokens_seen": false, "neftune_noise_alpha": null, "optim_target_modules": null, "batch_eval_metrics": false, "eval_on_start": false, "use_liger_kernel": false, "eval_use_gather_object": false, "average_tokens_across_devices": false }, "lora_config": null, "flops": { "eval": 5333250945655808, "train": 9219668431260864, "total": 14552919376916672 }, "total": { "total": 46259.1033, "train": 35409.45206, "eval": 10849.65124 }, "logs": [ { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 224 ], "batch_size": 8, "flops": 4429610391296 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 64 ], "batch_size": 8, "flops": 1265603017216 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 256 ], "batch_size": 8, "flops": 5062411866112 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 5, 144 ], "batch_size": 8, "flops": 2847606704256 } ], "timestamp": "2025-09-30 22:06:06.153312", "step": 0, "epoch": 0 }, { "type": "pplx", "content": 68335452.534585, "timestamp": "2025-09-30 22:06:06.159801", "step": 0, "epoch": 0 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:06:06.239586", "step": 0, "epoch": 1 }, { "type": "loss", "content": 0.3234884440898895, "timestamp": "2025-09-30 22:06:06.244678", "step": 1, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:06:06.292388", "step": 1, "epoch": 1 }, { "type": "loss", "content": 0.35814759135246277, "timestamp": "2025-09-30 22:06:06.295193", "step": 2, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:06:06.346912", "step": 2, "epoch": 1 }, { "type": "loss", "content": 0.31901952624320984, "timestamp": "2025-09-30 22:06:06.351399", "step": 3, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:06:06.388643", "step": 3, "epoch": 1 }, { "type": "loss", "content": 0.3721208870410919, "timestamp": "2025-09-30 22:06:06.455471", "step": 4, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-30 22:06:06.506862", "step": 4, "epoch": 1 }, { "type": "loss", "content": 0.10016343742609024, "timestamp": "2025-09-30 22:06:06.509505", "step": 5, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:06:06.548654", "step": 5, "epoch": 1 }, { "type": "loss", "content": 0.12158586829900742, "timestamp": "2025-09-30 22:06:06.562010", "step": 6, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-30 22:06:06.622986", "step": 6, "epoch": 1 }, { "type": "loss", "content": 0.030598539859056473, "timestamp": "2025-09-30 22:06:06.627321", "step": 7, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:06:06.662699", "step": 7, "epoch": 1 }, { "type": "loss", "content": 0.004455908201634884, "timestamp": "2025-09-30 22:06:06.698078", "step": 8, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-30 22:06:06.743279", "step": 8, "epoch": 1 }, { "type": "loss", "content": 0.01093387696892023, "timestamp": "2025-09-30 22:06:06.748884", "step": 9, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:06:06.781613", "step": 9, "epoch": 1 }, { "type": "loss", "content": 0.04314374923706055, "timestamp": "2025-09-30 22:06:06.784665", "step": 10, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-30 22:06:06.820810", "step": 10, "epoch": 1 }, { "type": "loss", "content": 0.02420825883746147, "timestamp": "2025-09-30 22:06:06.828663", "step": 11, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:06:06.862456", "step": 11, "epoch": 1 }, { "type": "loss", "content": 0.044014859944581985, "timestamp": "2025-09-30 22:06:06.887934", "step": 12, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:06:06.922528", "step": 12, "epoch": 1 }, { "type": "loss", "content": 0.0571591854095459, "timestamp": "2025-09-30 22:06:06.925638", "step": 13, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:06:06.966043", "step": 13, "epoch": 1 }, { "type": "loss", "content": 0.006987304426729679, "timestamp": "2025-09-30 22:06:06.969392", "step": 14, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:06:07.008983", "step": 14, "epoch": 1 }, { "type": "loss", "content": 0.039628542959690094, "timestamp": "2025-09-30 22:06:07.013366", "step": 15, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:06:07.048795", "step": 15, "epoch": 1 }, { "type": "loss", "content": 0.023766744881868362, "timestamp": "2025-09-30 22:06:07.078880", "step": 16, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:06:07.124202", "step": 16, "epoch": 1 }, { "type": "loss", "content": 0.020519467070698738, "timestamp": "2025-09-30 22:06:07.132232", "step": 17, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-30 22:06:07.197808", "step": 17, "epoch": 1 }, { "type": "loss", "content": 0.041614633053541183, "timestamp": "2025-09-30 22:06:07.201921", "step": 18, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-30 22:06:07.244302", "step": 18, "epoch": 1 }, { "type": "loss", "content": 0.04117533192038536, "timestamp": "2025-09-30 22:06:07.252496", "step": 19, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-30 22:06:07.293362", "step": 19, "epoch": 1 }, { "type": "loss", "content": 0.045252926647663116, "timestamp": "2025-09-30 22:06:07.321180", "step": 20, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-30 22:06:07.353793", "step": 20, "epoch": 1 }, { "type": "loss", "content": 0.027239639312028885, "timestamp": "2025-09-30 22:06:07.362465", "step": 21, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:06:07.396025", "step": 21, "epoch": 1 }, { "type": "loss", "content": 0.023690115660429, "timestamp": "2025-09-30 22:06:07.407312", "step": 22, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:06:07.448922", "step": 22, "epoch": 1 }, { "type": "loss", "content": 0.02684900537133217, "timestamp": "2025-09-30 22:06:07.460668", "step": 23, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:06:07.498815", "step": 23, "epoch": 1 }, { "type": "loss", "content": 0.026328429579734802, "timestamp": "2025-09-30 22:06:07.531164", "step": 24, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-30 22:06:07.569855", "step": 24, "epoch": 1 }, { "type": "loss", "content": 0.02680337429046631, "timestamp": "2025-09-30 22:06:07.577922", "step": 25, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 256 ], "flops": 7593922352128 }, "timestamp": "2025-09-30 22:06:07.619007", "step": 25, "epoch": 1 }, { "type": "loss", "content": 0.036179907619953156, "timestamp": "2025-09-30 22:06:07.628077", "step": 26, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:06:07.675201", "step": 26, "epoch": 1 }, { "type": "loss", "content": 0.04021405801177025, "timestamp": "2025-09-30 22:06:07.686454", "step": 27, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:06:07.738628", "step": 27, "epoch": 1 }, { "type": "loss", "content": 0.03643062338232994, "timestamp": "2025-09-30 22:06:07.764616", "step": 28, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-30 22:06:07.808584", "step": 28, "epoch": 1 }, { "type": "loss", "content": 0.02700633928179741, "timestamp": "2025-09-30 22:06:07.813260", "step": 29, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:06:07.858983", "step": 29, "epoch": 1 }, { "type": "loss", "content": 0.028900552541017532, "timestamp": "2025-09-30 22:06:07.861506", "step": 30, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:06:07.899420", "step": 30, "epoch": 1 }, { "type": "loss", "content": 0.03417787328362465, "timestamp": "2025-09-30 22:06:07.906483", "step": 31, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:06:07.945767", "step": 31, "epoch": 1 }, { "type": "loss", "content": 0.029415259137749672, "timestamp": "2025-09-30 22:06:07.970091", "step": 32, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-30 22:06:08.002736", "step": 32, "epoch": 1 }, { "type": "loss", "content": 0.018809909000992775, "timestamp": "2025-09-30 22:06:08.008770", "step": 33, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-30 22:06:08.040796", "step": 33, "epoch": 1 }, { "type": "loss", "content": 0.02093845047056675, "timestamp": "2025-09-30 22:06:08.047832", "step": 34, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:06:08.085350", "step": 34, "epoch": 1 }, { "type": "loss", "content": 0.036695707589387894, "timestamp": "2025-09-30 22:06:08.090001", "step": 35, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:06:08.133354", "step": 35, "epoch": 1 }, { "type": "loss", "content": 0.034678272902965546, "timestamp": "2025-09-30 22:06:08.162471", "step": 36, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:06:08.195249", "step": 36, "epoch": 1 }, { "type": "loss", "content": 0.025288773700594902, "timestamp": "2025-09-30 22:06:08.198670", "step": 37, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:06:08.238253", "step": 37, "epoch": 1 }, { "type": "loss", "content": 0.030010608956217766, "timestamp": "2025-09-30 22:06:08.248944", "step": 38, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:06:08.289167", "step": 38, "epoch": 1 }, { "type": "loss", "content": 0.022928962484002113, "timestamp": "2025-09-30 22:06:08.298947", "step": 39, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 224 ], "batch_size": 8, "flops": 4429610391296 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 64 ], "batch_size": 8, "flops": 1265603017216 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 256 ], "batch_size": 8, "flops": 5062411866112 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 5, 144 ], "batch_size": 8, "flops": 2847606704256 } ], "timestamp": "2025-09-30 22:06:08.941089", "step": 39, "epoch": 1 }, { "type": "pplx", "content": 50567875.175425716, "timestamp": "2025-09-30 22:06:08.946960", "step": 39, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:06:08.984603", "step": 39, "epoch": 1 }, { "type": "loss", "content": 0.021819669753313065, "timestamp": "2025-09-30 22:06:09.009552", "step": 40, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-30 22:06:09.042289", "step": 40, "epoch": 1 }, { "type": "loss", "content": 0.02205788530409336, "timestamp": "2025-09-30 22:06:09.049659", "step": 41, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:06:09.080398", "step": 41, "epoch": 1 }, { "type": "loss", "content": 0.02417643368244171, "timestamp": "2025-09-30 22:06:09.088249", "step": 42, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:06:09.124471", "step": 42, "epoch": 1 }, { "type": "loss", "content": 0.022384127601981163, "timestamp": "2025-09-30 22:06:09.132259", "step": 43, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-30 22:06:09.169630", "step": 43, "epoch": 1 }, { "type": "loss", "content": 0.025165708735585213, "timestamp": "2025-09-30 22:06:09.198593", "step": 44, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:06:09.232888", "step": 44, "epoch": 1 }, { "type": "loss", "content": 0.020649874582886696, "timestamp": "2025-09-30 22:06:09.242566", "step": 45, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:06:09.278932", "step": 45, "epoch": 1 }, { "type": "loss", "content": 0.024506300687789917, "timestamp": "2025-09-30 22:06:09.285321", "step": 46, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:06:09.320739", "step": 46, "epoch": 1 }, { "type": "loss", "content": 0.030327823013067245, "timestamp": "2025-09-30 22:06:09.327781", "step": 47, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-30 22:06:09.363739", "step": 47, "epoch": 1 }, { "type": "loss", "content": 0.026908371597528458, "timestamp": "2025-09-30 22:06:09.392728", "step": 48, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:06:09.435116", "step": 48, "epoch": 1 }, { "type": "loss", "content": 0.023281460627913475, "timestamp": "2025-09-30 22:06:09.441497", "step": 49, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:06:09.474274", "step": 49, "epoch": 1 }, { "type": "loss", "content": 0.017589669674634933, "timestamp": "2025-09-30 22:06:09.483069", "step": 50, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:06:09.523298", "step": 50, "epoch": 1 }, { "type": "loss", "content": 0.026965469121932983, "timestamp": "2025-09-30 22:06:09.532137", "step": 51, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:06:09.571058", "step": 51, "epoch": 1 }, { "type": "loss", "content": 0.0181273240596056, "timestamp": "2025-09-30 22:06:09.599752", "step": 52, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:06:09.634906", "step": 52, "epoch": 1 }, { "type": "loss", "content": 0.03717336431145668, "timestamp": "2025-09-30 22:06:09.642417", "step": 53, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:06:09.674098", "step": 53, "epoch": 1 }, { "type": "loss", "content": 0.034199971705675125, "timestamp": "2025-09-30 22:06:09.682337", "step": 54, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:06:09.718202", "step": 54, "epoch": 1 }, { "type": "loss", "content": 0.03449447080492973, "timestamp": "2025-09-30 22:06:09.725351", "step": 55, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:06:09.762014", "step": 55, "epoch": 1 }, { "type": "loss", "content": 0.02672051265835762, "timestamp": "2025-09-30 22:06:09.792245", "step": 56, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:06:09.828974", "step": 56, "epoch": 1 }, { "type": "loss", "content": 0.02459951676428318, "timestamp": "2025-09-30 22:06:09.832573", "step": 57, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:06:09.869323", "step": 57, "epoch": 1 }, { "type": "loss", "content": 0.038649629801511765, "timestamp": "2025-09-30 22:06:09.877878", "step": 58, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:06:09.916637", "step": 58, "epoch": 1 }, { "type": "loss", "content": 0.026001831516623497, "timestamp": "2025-09-30 22:06:09.919367", "step": 59, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:06:09.950252", "step": 59, "epoch": 1 }, { "type": "loss", "content": 0.027564698830246925, "timestamp": "2025-09-30 22:06:09.978868", "step": 60, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:06:10.023545", "step": 60, "epoch": 1 }, { "type": "loss", "content": 0.026089642196893692, "timestamp": "2025-09-30 22:06:10.026388", "step": 61, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:06:10.064725", "step": 61, "epoch": 1 }, { "type": "loss", "content": 0.015961112454533577, "timestamp": "2025-09-30 22:06:10.072737", "step": 62, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:06:10.112461", "step": 62, "epoch": 1 }, { "type": "loss", "content": 0.02579752914607525, "timestamp": "2025-09-30 22:06:10.122008", "step": 63, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:06:10.156037", "step": 63, "epoch": 1 }, { "type": "loss", "content": 0.02118992619216442, "timestamp": "2025-09-30 22:06:10.181430", "step": 64, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:06:10.215209", "step": 64, "epoch": 1 }, { "type": "loss", "content": 0.020561648532748222, "timestamp": "2025-09-30 22:06:10.225671", "step": 65, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:06:10.265401", "step": 65, "epoch": 1 }, { "type": "loss", "content": 0.024944249540567398, "timestamp": "2025-09-30 22:06:10.271671", "step": 66, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:06:10.311390", "step": 66, "epoch": 1 }, { "type": "loss", "content": 0.02434808574616909, "timestamp": "2025-09-30 22:06:10.315703", "step": 67, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-30 22:06:10.349470", "step": 67, "epoch": 1 }, { "type": "loss", "content": 0.01686147227883339, "timestamp": "2025-09-30 22:06:10.379368", "step": 68, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:06:10.415927", "step": 68, "epoch": 1 }, { "type": "loss", "content": 0.02224905416369438, "timestamp": "2025-09-30 22:06:10.423488", "step": 69, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:06:10.459309", "step": 69, "epoch": 1 }, { "type": "loss", "content": 0.028607143089175224, "timestamp": "2025-09-30 22:06:10.469321", "step": 70, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-30 22:06:10.509787", "step": 70, "epoch": 1 }, { "type": "loss", "content": 0.020568443462252617, "timestamp": "2025-09-30 22:06:10.512584", "step": 71, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-30 22:06:10.554598", "step": 71, "epoch": 1 }, { "type": "loss", "content": 0.022160660475492477, "timestamp": "2025-09-30 22:06:10.578901", "step": 72, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-30 22:06:10.617170", "step": 72, "epoch": 1 }, { "type": "loss", "content": 0.02690061740577221, "timestamp": "2025-09-30 22:06:10.620387", "step": 73, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:06:10.655114", "step": 73, "epoch": 1 }, { "type": "loss", "content": 0.025107603520154953, "timestamp": "2025-09-30 22:06:10.671213", "step": 74, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:06:10.704822", "step": 74, "epoch": 1 }, { "type": "loss", "content": 0.026476547122001648, "timestamp": "2025-09-30 22:06:10.708188", "step": 75, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:06:10.741972", "step": 75, "epoch": 1 }, { "type": "loss", "content": 0.032140474766492844, "timestamp": "2025-09-30 22:06:10.767598", "step": 76, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:06:10.799849", "step": 76, "epoch": 1 }, { "type": "loss", "content": 0.021117273718118668, "timestamp": "2025-09-30 22:06:10.804924", "step": 77, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-30 22:06:10.842499", "step": 77, "epoch": 1 }, { "type": "loss", "content": 0.023193124681711197, "timestamp": "2025-09-30 22:06:10.849349", "step": 78, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 224 ], "batch_size": 8, "flops": 4429610391296 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 64 ], "batch_size": 8, "flops": 1265603017216 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 256 ], "batch_size": 8, "flops": 5062411866112 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 5, 144 ], "batch_size": 8, "flops": 2847606704256 } ], "timestamp": "2025-09-30 22:06:11.470137", "step": 78, "epoch": 1 }, { "type": "pplx", "content": 57662058.52283761, "timestamp": "2025-09-30 22:06:11.476854", "step": 78, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-30 22:06:11.512578", "step": 78, "epoch": 1 }, { "type": "loss", "content": 0.028916534036397934, "timestamp": "2025-09-30 22:06:11.519955", "step": 79, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:06:11.553866", "step": 79, "epoch": 1 }, { "type": "loss", "content": 0.023038769140839577, "timestamp": "2025-09-30 22:06:11.584898", "step": 80, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-30 22:06:11.626129", "step": 80, "epoch": 1 }, { "type": "loss", "content": 0.023145731538534164, "timestamp": "2025-09-30 22:06:11.630803", "step": 81, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:06:11.663277", "step": 81, "epoch": 1 }, { "type": "loss", "content": 0.030305249616503716, "timestamp": "2025-09-30 22:06:11.667909", "step": 82, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-30 22:06:11.702379", "step": 82, "epoch": 1 }, { "type": "loss", "content": 0.02923610992729664, "timestamp": "2025-09-30 22:06:11.710410", "step": 83, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:06:11.750975", "step": 83, "epoch": 1 }, { "type": "loss", "content": 0.02699192799627781, "timestamp": "2025-09-30 22:06:11.775982", "step": 84, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-30 22:06:11.811393", "step": 84, "epoch": 1 }, { "type": "loss", "content": 0.021742191165685654, "timestamp": "2025-09-30 22:06:11.814504", "step": 85, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:06:11.852633", "step": 85, "epoch": 1 }, { "type": "loss", "content": 0.023906385526061058, "timestamp": "2025-09-30 22:06:11.864993", "step": 86, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-30 22:06:11.908281", "step": 86, "epoch": 1 }, { "type": "loss", "content": 0.02628713846206665, "timestamp": "2025-09-30 22:06:11.917104", "step": 87, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:06:11.958177", "step": 87, "epoch": 1 }, { "type": "loss", "content": 0.02474205382168293, "timestamp": "2025-09-30 22:06:11.986085", "step": 88, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-30 22:06:12.021239", "step": 88, "epoch": 1 }, { "type": "loss", "content": 0.024998720735311508, "timestamp": "2025-09-30 22:06:12.025688", "step": 89, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:06:12.056963", "step": 89, "epoch": 1 }, { "type": "loss", "content": 0.024945732206106186, "timestamp": "2025-09-30 22:06:12.064445", "step": 90, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:06:12.102084", "step": 90, "epoch": 1 }, { "type": "loss", "content": 0.02451312355697155, "timestamp": "2025-09-30 22:06:12.104799", "step": 91, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:06:12.137058", "step": 91, "epoch": 1 }, { "type": "loss", "content": 0.024372024461627007, "timestamp": "2025-09-30 22:06:12.169005", "step": 92, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:06:12.205970", "step": 92, "epoch": 1 }, { "type": "loss", "content": 0.0259132981300354, "timestamp": "2025-09-30 22:06:12.215291", "step": 93, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:06:12.252919", "step": 93, "epoch": 1 }, { "type": "loss", "content": 0.02676175720989704, "timestamp": "2025-09-30 22:06:12.257787", "step": 94, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:06:12.291285", "step": 94, "epoch": 1 }, { "type": "loss", "content": 0.02507203258574009, "timestamp": "2025-09-30 22:06:12.295781", "step": 95, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:06:12.333920", "step": 95, "epoch": 1 }, { "type": "loss", "content": 0.024420084431767464, "timestamp": "2025-09-30 22:06:12.362162", "step": 96, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:06:12.395610", "step": 96, "epoch": 1 }, { "type": "loss", "content": 0.022208968177437782, "timestamp": "2025-09-30 22:06:12.402115", "step": 97, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:06:12.438826", "step": 97, "epoch": 1 }, { "type": "loss", "content": 0.024541867896914482, "timestamp": "2025-09-30 22:06:12.446117", "step": 98, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-30 22:06:12.484405", "step": 98, "epoch": 1 }, { "type": "loss", "content": 0.027195433154702187, "timestamp": "2025-09-30 22:06:12.492014", "step": 99, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:06:12.526469", "step": 99, "epoch": 1 }, { "type": "loss", "content": 0.02440270408987999, "timestamp": "2025-09-30 22:06:12.554906", "step": 100, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-30 22:06:12.594181", "step": 100, "epoch": 1 }, { "type": "loss", "content": 0.025244589895009995, "timestamp": "2025-09-30 22:06:12.598914", "step": 101, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:06:12.635269", "step": 101, "epoch": 1 }, { "type": "loss", "content": 0.02790530025959015, "timestamp": "2025-09-30 22:06:12.644348", "step": 102, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:06:12.681147", "step": 102, "epoch": 1 }, { "type": "loss", "content": 0.028556600213050842, "timestamp": "2025-09-30 22:06:12.683834", "step": 103, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:06:12.720286", "step": 103, "epoch": 1 }, { "type": "loss", "content": 0.02198629453778267, "timestamp": "2025-09-30 22:06:12.745656", "step": 104, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 256 ], "flops": 7593922352128 }, "timestamp": "2025-09-30 22:06:12.776981", "step": 104, "epoch": 1 }, { "type": "loss", "content": 0.029764581471681595, "timestamp": "2025-09-30 22:06:12.783395", "step": 105, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-30 22:06:12.818699", "step": 105, "epoch": 1 }, { "type": "loss", "content": 0.026740163564682007, "timestamp": "2025-09-30 22:06:12.824581", "step": 106, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:06:12.859914", "step": 106, "epoch": 1 }, { "type": "loss", "content": 0.02745322324335575, "timestamp": "2025-09-30 22:06:12.864187", "step": 107, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:06:12.903188", "step": 107, "epoch": 1 }, { "type": "loss", "content": 0.025504307821393013, "timestamp": "2025-09-30 22:06:12.929627", "step": 108, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:06:12.966009", "step": 108, "epoch": 1 }, { "type": "loss", "content": 0.018806248903274536, "timestamp": "2025-09-30 22:06:12.968948", "step": 109, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:06:13.003576", "step": 109, "epoch": 1 }, { "type": "loss", "content": 0.03055635467171669, "timestamp": "2025-09-30 22:06:13.007649", "step": 110, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:06:13.044541", "step": 110, "epoch": 1 }, { "type": "loss", "content": 0.026181330904364586, "timestamp": "2025-09-30 22:06:13.049180", "step": 111, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:06:13.079905", "step": 111, "epoch": 1 }, { "type": "loss", "content": 0.02378338761627674, "timestamp": "2025-09-30 22:06:13.109191", "step": 112, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:06:13.141543", "step": 112, "epoch": 1 }, { "type": "loss", "content": 0.025588760152459145, "timestamp": "2025-09-30 22:06:13.149020", "step": 113, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-30 22:06:13.181795", "step": 113, "epoch": 1 }, { "type": "loss", "content": 0.0226790402084589, "timestamp": "2025-09-30 22:06:13.189465", "step": 114, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:06:13.229374", "step": 114, "epoch": 1 }, { "type": "loss", "content": 0.02701934054493904, "timestamp": "2025-09-30 22:06:13.233135", "step": 115, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:06:13.265186", "step": 115, "epoch": 1 }, { "type": "loss", "content": 0.025871511548757553, "timestamp": "2025-09-30 22:06:13.292187", "step": 116, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-30 22:06:13.331077", "step": 116, "epoch": 1 }, { "type": "loss", "content": 0.01798613741993904, "timestamp": "2025-09-30 22:06:13.337599", "step": 117, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 224 ], "batch_size": 8, "flops": 4429610391296 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 64 ], "batch_size": 8, "flops": 1265603017216 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 256 ], "batch_size": 8, "flops": 5062411866112 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 5, 144 ], "batch_size": 8, "flops": 2847606704256 } ], "timestamp": "2025-09-30 22:06:13.985580", "step": 117, "epoch": 1 }, { "type": "pplx", "content": 58057663.45508437, "timestamp": "2025-09-30 22:06:13.988297", "step": 117, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-30 22:06:14.024582", "step": 117, "epoch": 1 }, { "type": "loss", "content": 0.023783918470144272, "timestamp": "2025-09-30 22:06:14.034515", "step": 118, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:06:14.074642", "step": 118, "epoch": 1 }, { "type": "loss", "content": 0.023426802828907967, "timestamp": "2025-09-30 22:06:14.079544", "step": 119, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 288 ], "flops": 8543129804160 }, "timestamp": "2025-09-30 22:06:14.127430", "step": 119, "epoch": 1 }, { "type": "loss", "content": 0.023924678564071655, "timestamp": "2025-09-30 22:06:14.156372", "step": 120, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:06:14.191732", "step": 120, "epoch": 1 }, { "type": "loss", "content": 0.02505522407591343, "timestamp": "2025-09-30 22:06:14.202667", "step": 121, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:06:14.243795", "step": 121, "epoch": 1 }, { "type": "loss", "content": 0.023241449147462845, "timestamp": "2025-09-30 22:06:14.248981", "step": 122, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:06:14.293503", "step": 122, "epoch": 1 }, { "type": "loss", "content": 0.026728898286819458, "timestamp": "2025-09-30 22:06:14.310532", "step": 123, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-30 22:06:14.349036", "step": 123, "epoch": 1 }, { "type": "loss", "content": 0.028911380097270012, "timestamp": "2025-09-30 22:06:14.383960", "step": 124, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:06:14.430695", "step": 124, "epoch": 1 }, { "type": "loss", "content": 0.023794135078787804, "timestamp": "2025-09-30 22:06:14.446784", "step": 125, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:06:14.496583", "step": 125, "epoch": 1 }, { "type": "loss", "content": 0.021708523854613304, "timestamp": "2025-09-30 22:06:14.500936", "step": 126, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:06:14.538234", "step": 126, "epoch": 1 }, { "type": "loss", "content": 0.023822659626603127, "timestamp": "2025-09-30 22:06:14.546642", "step": 127, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:06:14.582032", "step": 127, "epoch": 1 }, { "type": "loss", "content": 0.024587152525782585, "timestamp": "2025-09-30 22:06:14.607227", "step": 128, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:06:14.642587", "step": 128, "epoch": 1 }, { "type": "loss", "content": 0.025723149999976158, "timestamp": "2025-09-30 22:06:14.647641", "step": 129, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 272 ], "flops": 8068526078144 }, "timestamp": "2025-09-30 22:06:14.698697", "step": 129, "epoch": 1 }, { "type": "loss", "content": 0.025141611695289612, "timestamp": "2025-09-30 22:06:14.704698", "step": 130, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:06:14.739117", "step": 130, "epoch": 1 }, { "type": "loss", "content": 0.02382255718111992, "timestamp": "2025-09-30 22:06:14.743358", "step": 131, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-30 22:06:14.776924", "step": 131, "epoch": 1 }, { "type": "loss", "content": 0.02148338034749031, "timestamp": "2025-09-30 22:06:14.801074", "step": 132, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:06:14.832460", "step": 132, "epoch": 1 }, { "type": "loss", "content": 0.023856794461607933, "timestamp": "2025-09-30 22:06:14.834499", "step": 133, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-30 22:06:14.879981", "step": 133, "epoch": 1 }, { "type": "loss", "content": 0.02617805078625679, "timestamp": "2025-09-30 22:06:14.886922", "step": 134, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-30 22:06:14.927519", "step": 134, "epoch": 1 }, { "type": "loss", "content": 0.023855309933423996, "timestamp": "2025-09-30 22:06:14.935227", "step": 135, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:06:14.972812", "step": 135, "epoch": 1 }, { "type": "loss", "content": 0.024362707510590553, "timestamp": "2025-09-30 22:06:14.998356", "step": 136, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-30 22:06:15.032411", "step": 136, "epoch": 1 }, { "type": "loss", "content": 0.023809295147657394, "timestamp": "2025-09-30 22:06:15.034375", "step": 137, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:06:15.072361", "step": 137, "epoch": 1 }, { "type": "loss", "content": 0.02297171764075756, "timestamp": "2025-09-30 22:06:15.076210", "step": 138, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:06:15.121812", "step": 138, "epoch": 1 }, { "type": "loss", "content": 0.02328100986778736, "timestamp": "2025-09-30 22:06:15.126486", "step": 139, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:06:15.158918", "step": 139, "epoch": 1 }, { "type": "loss", "content": 0.023281242698431015, "timestamp": "2025-09-30 22:06:15.191638", "step": 140, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:06:15.232799", "step": 140, "epoch": 1 }, { "type": "loss", "content": 0.023923074826598167, "timestamp": "2025-09-30 22:06:15.236324", "step": 141, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 256 ], "flops": 7593922352128 }, "timestamp": "2025-09-30 22:06:15.277095", "step": 141, "epoch": 1 }, { "type": "loss", "content": 0.024552708491683006, "timestamp": "2025-09-30 22:06:15.287847", "step": 142, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:06:15.341930", "step": 142, "epoch": 1 }, { "type": "loss", "content": 0.02170448563992977, "timestamp": "2025-09-30 22:06:15.359891", "step": 143, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:06:15.407440", "step": 143, "epoch": 1 }, { "type": "loss", "content": 0.025551343336701393, "timestamp": "2025-09-30 22:06:15.439892", "step": 144, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:06:15.495337", "step": 144, "epoch": 1 }, { "type": "loss", "content": 0.02667604200541973, "timestamp": "2025-09-30 22:06:15.509839", "step": 145, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:06:15.557378", "step": 145, "epoch": 1 }, { "type": "loss", "content": 0.02111595869064331, "timestamp": "2025-09-30 22:06:15.561594", "step": 146, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:06:15.601333", "step": 146, "epoch": 1 }, { "type": "loss", "content": 0.02530730329453945, "timestamp": "2025-09-30 22:06:15.610325", "step": 147, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:06:15.650918", "step": 147, "epoch": 1 }, { "type": "loss", "content": 0.023542964830994606, "timestamp": "2025-09-30 22:06:15.681910", "step": 148, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:06:15.739661", "step": 148, "epoch": 1 }, { "type": "loss", "content": 0.026816150173544884, "timestamp": "2025-09-30 22:06:15.748319", "step": 149, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:06:15.795347", "step": 149, "epoch": 1 }, { "type": "loss", "content": 0.023416031152009964, "timestamp": "2025-09-30 22:06:15.803601", "step": 150, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:06:15.840449", "step": 150, "epoch": 1 }, { "type": "loss", "content": 0.02175022102892399, "timestamp": "2025-09-30 22:06:15.850178", "step": 151, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:06:15.890241", "step": 151, "epoch": 1 }, { "type": "loss", "content": 0.021428557112812996, "timestamp": "2025-09-30 22:06:15.918399", "step": 152, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:06:15.972154", "step": 152, "epoch": 1 }, { "type": "loss", "content": 0.02025659568607807, "timestamp": "2025-09-30 22:06:15.985827", "step": 153, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:06:16.022412", "step": 153, "epoch": 1 }, { "type": "loss", "content": 0.029116351157426834, "timestamp": "2025-09-30 22:06:16.026546", "step": 154, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:06:16.064960", "step": 154, "epoch": 1 }, { "type": "loss", "content": 0.023566367104649544, "timestamp": "2025-09-30 22:06:16.069181", "step": 155, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-30 22:06:16.104476", "step": 155, "epoch": 1 }, { "type": "loss", "content": 0.024201836436986923, "timestamp": "2025-09-30 22:06:16.132556", "step": 156, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 224 ], "batch_size": 8, "flops": 4429610391296 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 64 ], "batch_size": 8, "flops": 1265603017216 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 256 ], "batch_size": 8, "flops": 5062411866112 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 5, 144 ], "batch_size": 8, "flops": 2847606704256 } ], "timestamp": "2025-09-30 22:06:16.805291", "step": 156, "epoch": 1 }, { "type": "pplx", "content": 60128195.901556, "timestamp": "2025-09-30 22:06:16.810561", "step": 156, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-30 22:06:16.842713", "step": 156, "epoch": 1 }, { "type": "loss", "content": 0.023354709148406982, "timestamp": "2025-09-30 22:06:16.845277", "step": 157, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:06:16.879380", "step": 157, "epoch": 1 }, { "type": "loss", "content": 0.023321080952882767, "timestamp": "2025-09-30 22:06:16.884106", "step": 158, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:06:16.918371", "step": 158, "epoch": 1 }, { "type": "loss", "content": 0.0184608343988657, "timestamp": "2025-09-30 22:06:16.921450", "step": 159, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-30 22:06:16.963578", "step": 159, "epoch": 1 }, { "type": "loss", "content": 0.028638625517487526, "timestamp": "2025-09-30 22:06:16.991655", "step": 160, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:06:17.025929", "step": 160, "epoch": 1 }, { "type": "loss", "content": 0.026660356670618057, "timestamp": "2025-09-30 22:06:17.027849", "step": 161, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:06:17.063893", "step": 161, "epoch": 1 }, { "type": "loss", "content": 0.020861739292740822, "timestamp": "2025-09-30 22:06:17.065804", "step": 162, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-30 22:06:17.099463", "step": 162, "epoch": 1 }, { "type": "loss", "content": 0.03286191076040268, "timestamp": "2025-09-30 22:06:17.107264", "step": 163, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:06:17.155934", "step": 163, "epoch": 1 }, { "type": "loss", "content": 0.026328887790441513, "timestamp": "2025-09-30 22:06:17.181533", "step": 164, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:06:17.225929", "step": 164, "epoch": 1 }, { "type": "loss", "content": 0.020726049318909645, "timestamp": "2025-09-30 22:06:17.228526", "step": 165, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:06:17.263039", "step": 165, "epoch": 1 }, { "type": "loss", "content": 0.028658464550971985, "timestamp": "2025-09-30 22:06:17.267317", "step": 166, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:06:17.303745", "step": 166, "epoch": 1 }, { "type": "loss", "content": 0.02197512611746788, "timestamp": "2025-09-30 22:06:17.308076", "step": 167, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:06:17.341695", "step": 167, "epoch": 1 }, { "type": "loss", "content": 0.02463226579129696, "timestamp": "2025-09-30 22:06:17.366485", "step": 168, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-30 22:06:17.398774", "step": 168, "epoch": 1 }, { "type": "loss", "content": 0.023855552077293396, "timestamp": "2025-09-30 22:06:17.403932", "step": 169, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:06:17.438146", "step": 169, "epoch": 1 }, { "type": "loss", "content": 0.02280578203499317, "timestamp": "2025-09-30 22:06:17.442377", "step": 170, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:06:17.479740", "step": 170, "epoch": 1 }, { "type": "loss", "content": 0.0270631592720747, "timestamp": "2025-09-30 22:06:17.482061", "step": 171, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-30 22:06:17.519029", "step": 171, "epoch": 1 }, { "type": "loss", "content": 0.024634188041090965, "timestamp": "2025-09-30 22:06:17.547343", "step": 172, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:06:17.580035", "step": 172, "epoch": 1 }, { "type": "loss", "content": 0.028481189161539078, "timestamp": "2025-09-30 22:06:17.582263", "step": 173, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:06:17.619799", "step": 173, "epoch": 1 }, { "type": "loss", "content": 0.024899035692214966, "timestamp": "2025-09-30 22:06:17.621892", "step": 174, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:06:17.656923", "step": 174, "epoch": 1 }, { "type": "loss", "content": 0.01988835260272026, "timestamp": "2025-09-30 22:06:17.661545", "step": 175, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:06:17.694918", "step": 175, "epoch": 1 }, { "type": "loss", "content": 0.02222885750234127, "timestamp": "2025-09-30 22:06:17.718625", "step": 176, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-30 22:06:17.751206", "step": 176, "epoch": 1 }, { "type": "loss", "content": 0.026466725394129753, "timestamp": "2025-09-30 22:06:17.753279", "step": 177, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:06:17.785153", "step": 177, "epoch": 1 }, { "type": "loss", "content": 0.018711643293499947, "timestamp": "2025-09-30 22:06:17.787833", "step": 178, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:06:17.823812", "step": 178, "epoch": 1 }, { "type": "loss", "content": 0.022940317168831825, "timestamp": "2025-09-30 22:06:17.828208", "step": 179, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:06:17.862094", "step": 179, "epoch": 1 }, { "type": "loss", "content": 0.02327939309179783, "timestamp": "2025-09-30 22:06:17.885892", "step": 180, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:06:17.918900", "step": 180, "epoch": 1 }, { "type": "loss", "content": 0.02618366852402687, "timestamp": "2025-09-30 22:06:17.920795", "step": 181, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-30 22:06:17.953019", "step": 181, "epoch": 1 }, { "type": "loss", "content": 0.026767287403345108, "timestamp": "2025-09-30 22:06:17.960637", "step": 182, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:06:17.995007", "step": 182, "epoch": 1 }, { "type": "loss", "content": 0.02279074303805828, "timestamp": "2025-09-30 22:06:17.997057", "step": 183, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:06:18.034141", "step": 183, "epoch": 1 }, { "type": "loss", "content": 0.023376479744911194, "timestamp": "2025-09-30 22:06:18.058008", "step": 184, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:06:18.089115", "step": 184, "epoch": 1 }, { "type": "loss", "content": 0.023480886593461037, "timestamp": "2025-09-30 22:06:18.091595", "step": 185, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:06:18.123826", "step": 185, "epoch": 1 }, { "type": "loss", "content": 0.024746397510170937, "timestamp": "2025-09-30 22:06:18.128096", "step": 186, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:06:18.164361", "step": 186, "epoch": 1 }, { "type": "loss", "content": 0.026330998167395592, "timestamp": "2025-09-30 22:06:18.166292", "step": 187, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-30 22:06:18.198316", "step": 187, "epoch": 1 }, { "type": "loss", "content": 0.024767503142356873, "timestamp": "2025-09-30 22:06:18.226573", "step": 188, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:06:18.261000", "step": 188, "epoch": 1 }, { "type": "loss", "content": 0.0204471405595541, "timestamp": "2025-09-30 22:06:18.263280", "step": 189, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:06:18.294801", "step": 189, "epoch": 1 }, { "type": "loss", "content": 0.022645656019449234, "timestamp": "2025-09-30 22:06:18.299066", "step": 190, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:06:18.331507", "step": 190, "epoch": 1 }, { "type": "loss", "content": 0.026833781972527504, "timestamp": "2025-09-30 22:06:18.333483", "step": 191, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:06:18.367209", "step": 191, "epoch": 1 }, { "type": "loss", "content": 0.023763876408338547, "timestamp": "2025-09-30 22:06:18.390938", "step": 192, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:06:18.421801", "step": 192, "epoch": 1 }, { "type": "loss", "content": 0.02496214583516121, "timestamp": "2025-09-30 22:06:18.423787", "step": 193, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:06:18.456539", "step": 193, "epoch": 1 }, { "type": "loss", "content": 0.022200098261237144, "timestamp": "2025-09-30 22:06:18.460592", "step": 194, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:06:18.491440", "step": 194, "epoch": 1 }, { "type": "loss", "content": 0.0215250663459301, "timestamp": "2025-09-30 22:06:18.494339", "step": 195, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 224 ], "batch_size": 8, "flops": 4429610391296 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 64 ], "batch_size": 8, "flops": 1265603017216 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 256 ], "batch_size": 8, "flops": 5062411866112 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 5, 144 ], "batch_size": 8, "flops": 2847606704256 } ], "timestamp": "2025-09-30 22:06:19.146650", "step": 195, "epoch": 1 }, { "type": "pplx", "content": 61265013.73487314, "timestamp": "2025-09-30 22:06:19.149720", "step": 195, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:06:19.182883", "step": 195, "epoch": 1 }, { "type": "loss", "content": 0.023729149252176285, "timestamp": "2025-09-30 22:06:19.208362", "step": 196, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:06:19.248428", "step": 196, "epoch": 1 }, { "type": "loss", "content": 0.021181587129831314, "timestamp": "2025-09-30 22:06:19.250925", "step": 197, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:06:19.283508", "step": 197, "epoch": 1 }, { "type": "loss", "content": 0.022421227768063545, "timestamp": "2025-09-30 22:06:19.286159", "step": 198, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-30 22:06:19.318784", "step": 198, "epoch": 1 }, { "type": "loss", "content": 0.02127877064049244, "timestamp": "2025-09-30 22:06:19.326066", "step": 199, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-30 22:06:19.360335", "step": 199, "epoch": 1 }, { "type": "loss", "content": 0.02680543251335621, "timestamp": "2025-09-30 22:06:19.388112", "step": 200, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-30 22:06:19.424365", "step": 200, "epoch": 1 }, { "type": "loss", "content": 0.021091178059577942, "timestamp": "2025-09-30 22:06:19.438368", "step": 201, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:06:19.481803", "step": 201, "epoch": 1 }, { "type": "loss", "content": 0.024247558787465096, "timestamp": "2025-09-30 22:06:19.486717", "step": 202, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:06:19.520237", "step": 202, "epoch": 1 }, { "type": "loss", "content": 0.021932261064648628, "timestamp": "2025-09-30 22:06:19.533549", "step": 203, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:06:19.579316", "step": 203, "epoch": 1 }, { "type": "loss", "content": 0.025690482929348946, "timestamp": "2025-09-30 22:06:19.604139", "step": 204, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:06:19.652713", "step": 204, "epoch": 1 }, { "type": "loss", "content": 0.028097007423639297, "timestamp": "2025-09-30 22:06:19.657320", "step": 205, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:06:19.694961", "step": 205, "epoch": 1 }, { "type": "loss", "content": 0.023479444906115532, "timestamp": "2025-09-30 22:06:19.709008", "step": 206, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-30 22:06:19.768686", "step": 206, "epoch": 1 }, { "type": "loss", "content": 0.018325267359614372, "timestamp": "2025-09-30 22:06:19.776601", "step": 207, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-30 22:06:19.815261", "step": 207, "epoch": 1 }, { "type": "loss", "content": 0.024427268654108047, "timestamp": "2025-09-30 22:06:19.843859", "step": 208, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:06:19.882618", "step": 208, "epoch": 1 }, { "type": "loss", "content": 0.02468344010412693, "timestamp": "2025-09-30 22:06:19.886118", "step": 209, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:06:19.925200", "step": 209, "epoch": 1 }, { "type": "loss", "content": 0.028634795919060707, "timestamp": "2025-09-30 22:06:19.927329", "step": 210, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-30 22:06:19.968803", "step": 210, "epoch": 1 }, { "type": "loss", "content": 0.026183156296610832, "timestamp": "2025-09-30 22:06:19.975598", "step": 211, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:06:20.010821", "step": 211, "epoch": 1 }, { "type": "loss", "content": 0.01938895508646965, "timestamp": "2025-09-30 22:06:20.035584", "step": 212, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-30 22:06:20.074552", "step": 212, "epoch": 1 }, { "type": "loss", "content": 0.023146267980337143, "timestamp": "2025-09-30 22:06:20.079433", "step": 213, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:06:20.112483", "step": 213, "epoch": 1 }, { "type": "loss", "content": 0.030951758846640587, "timestamp": "2025-09-30 22:06:20.119834", "step": 214, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:06:20.151526", "step": 214, "epoch": 1 }, { "type": "loss", "content": 0.017646988853812218, "timestamp": "2025-09-30 22:06:20.156564", "step": 215, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:06:20.191678", "step": 215, "epoch": 1 }, { "type": "loss", "content": 0.020599817857146263, "timestamp": "2025-09-30 22:06:20.221417", "step": 216, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-30 22:06:20.256856", "step": 216, "epoch": 1 }, { "type": "loss", "content": 0.02311067096889019, "timestamp": "2025-09-30 22:06:20.265333", "step": 217, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 256 ], "flops": 7593922352128 }, "timestamp": "2025-09-30 22:06:20.301971", "step": 217, "epoch": 1 }, { "type": "loss", "content": 0.018607063218951225, "timestamp": "2025-09-30 22:06:20.310012", "step": 218, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:06:20.350535", "step": 218, "epoch": 1 }, { "type": "loss", "content": 0.019698362797498703, "timestamp": "2025-09-30 22:06:20.358037", "step": 219, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 256 ], "flops": 7593922352128 }, "timestamp": "2025-09-30 22:06:20.402403", "step": 219, "epoch": 1 }, { "type": "loss", "content": 0.02162352204322815, "timestamp": "2025-09-30 22:06:20.431291", "step": 220, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:06:20.474981", "step": 220, "epoch": 1 }, { "type": "loss", "content": 0.02282586693763733, "timestamp": "2025-09-30 22:06:20.482606", "step": 221, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 272 ], "flops": 8068526078144 }, "timestamp": "2025-09-30 22:06:20.517739", "step": 221, "epoch": 1 }, { "type": "loss", "content": 0.02331475540995598, "timestamp": "2025-09-30 22:06:20.528010", "step": 222, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-30 22:06:20.561612", "step": 222, "epoch": 1 }, { "type": "loss", "content": 0.022115275263786316, "timestamp": "2025-09-30 22:06:20.569022", "step": 223, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-30 22:06:20.605957", "step": 223, "epoch": 1 }, { "type": "loss", "content": 0.023793192580342293, "timestamp": "2025-09-30 22:06:20.634937", "step": 224, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:06:20.666302", "step": 224, "epoch": 1 }, { "type": "loss", "content": 0.025844072923064232, "timestamp": "2025-09-30 22:06:20.673036", "step": 225, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:06:20.710131", "step": 225, "epoch": 1 }, { "type": "loss", "content": 0.03035455010831356, "timestamp": "2025-09-30 22:06:20.718027", "step": 226, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:06:20.753996", "step": 226, "epoch": 1 }, { "type": "loss", "content": 0.023812538012862206, "timestamp": "2025-09-30 22:06:20.759884", "step": 227, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:06:20.793425", "step": 227, "epoch": 1 }, { "type": "loss", "content": 0.023976163938641548, "timestamp": "2025-09-30 22:06:20.819722", "step": 228, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-30 22:06:20.860481", "step": 228, "epoch": 1 }, { "type": "loss", "content": 0.015135176479816437, "timestamp": "2025-09-30 22:06:20.875475", "step": 229, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:06:20.918279", "step": 229, "epoch": 1 }, { "type": "loss", "content": 0.020723579451441765, "timestamp": "2025-09-30 22:06:20.921030", "step": 230, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:06:20.954749", "step": 230, "epoch": 1 }, { "type": "loss", "content": 0.020386500284075737, "timestamp": "2025-09-30 22:06:20.962096", "step": 231, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-30 22:06:21.000099", "step": 231, "epoch": 1 }, { "type": "loss", "content": 0.01781288906931877, "timestamp": "2025-09-30 22:06:21.029073", "step": 232, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-30 22:06:21.072306", "step": 232, "epoch": 1 }, { "type": "loss", "content": 0.01598362810909748, "timestamp": "2025-09-30 22:06:21.083441", "step": 233, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:06:21.115580", "step": 233, "epoch": 1 }, { "type": "loss", "content": 0.027275921776890755, "timestamp": "2025-09-30 22:06:21.122328", "step": 234, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 224 ], "batch_size": 8, "flops": 4429610391296 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 64 ], "batch_size": 8, "flops": 1265603017216 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 256 ], "batch_size": 8, "flops": 5062411866112 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 5, 144 ], "batch_size": 8, "flops": 2847606704256 } ], "timestamp": "2025-09-30 22:06:21.782110", "step": 234, "epoch": 1 }, { "type": "pplx", "content": 64263270.81331823, "timestamp": "2025-09-30 22:06:21.790075", "step": 234, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:06:21.827322", "step": 234, "epoch": 1 }, { "type": "loss", "content": 0.025542790070176125, "timestamp": "2025-09-30 22:06:21.829775", "step": 235, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 256 ], "flops": 7593922352128 }, "timestamp": "2025-09-30 22:06:21.870541", "step": 235, "epoch": 1 }, { "type": "loss", "content": 0.017988238483667374, "timestamp": "2025-09-30 22:06:21.899521", "step": 236, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:06:21.942330", "step": 236, "epoch": 1 }, { "type": "loss", "content": 0.03205994889140129, "timestamp": "2025-09-30 22:06:21.948012", "step": 237, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:06:21.986669", "step": 237, "epoch": 1 }, { "type": "loss", "content": 0.01907184161245823, "timestamp": "2025-09-30 22:06:21.991617", "step": 238, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:06:22.025386", "step": 238, "epoch": 1 }, { "type": "loss", "content": 0.017065171152353287, "timestamp": "2025-09-30 22:06:22.028018", "step": 239, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:06:22.064478", "step": 239, "epoch": 1 }, { "type": "loss", "content": 0.023361805826425552, "timestamp": "2025-09-30 22:06:22.090095", "step": 240, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-30 22:06:22.125961", "step": 240, "epoch": 1 }, { "type": "loss", "content": 0.025280656293034554, "timestamp": "2025-09-30 22:06:22.134904", "step": 241, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:06:22.171375", "step": 241, "epoch": 1 }, { "type": "loss", "content": 0.015582531690597534, "timestamp": "2025-09-30 22:06:22.179252", "step": 242, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-30 22:06:22.215546", "step": 242, "epoch": 1 }, { "type": "loss", "content": 0.021414535120129585, "timestamp": "2025-09-30 22:06:22.222422", "step": 243, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:06:22.258698", "step": 243, "epoch": 1 }, { "type": "loss", "content": 0.023022836074233055, "timestamp": "2025-09-30 22:06:22.287682", "step": 244, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-30 22:06:22.320455", "step": 244, "epoch": 1 }, { "type": "loss", "content": 0.018914300948381424, "timestamp": "2025-09-30 22:06:22.325263", "step": 245, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-30 22:06:22.359000", "step": 245, "epoch": 1 }, { "type": "loss", "content": 0.027963733300566673, "timestamp": "2025-09-30 22:06:22.366766", "step": 246, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:06:22.404691", "step": 246, "epoch": 1 }, { "type": "loss", "content": 0.01878245547413826, "timestamp": "2025-09-30 22:06:22.409119", "step": 247, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:06:22.456732", "step": 247, "epoch": 1 }, { "type": "loss", "content": 0.023027878254652023, "timestamp": "2025-09-30 22:06:22.486436", "step": 248, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-30 22:06:22.524432", "step": 248, "epoch": 1 }, { "type": "loss", "content": 0.019469697028398514, "timestamp": "2025-09-30 22:06:22.531177", "step": 249, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:06:22.578306", "step": 249, "epoch": 1 }, { "type": "loss", "content": 0.018262622877955437, "timestamp": "2025-09-30 22:06:22.581410", "step": 250, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:06:22.615104", "step": 250, "epoch": 1 }, { "type": "loss", "content": 0.0221934225410223, "timestamp": "2025-09-30 22:06:22.622786", "step": 251, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:06:22.656684", "step": 251, "epoch": 1 }, { "type": "loss", "content": 0.01658940128982067, "timestamp": "2025-09-30 22:06:22.687329", "step": 252, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:06:22.729072", "step": 252, "epoch": 1 }, { "type": "loss", "content": 0.01807001605629921, "timestamp": "2025-09-30 22:06:22.736230", "step": 253, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:06:22.774577", "step": 253, "epoch": 1 }, { "type": "loss", "content": 0.021754702553153038, "timestamp": "2025-09-30 22:06:22.782602", "step": 254, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:06:22.820114", "step": 254, "epoch": 1 }, { "type": "loss", "content": 0.026683764532208443, "timestamp": "2025-09-30 22:06:22.827270", "step": 255, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:06:22.865345", "step": 255, "epoch": 1 }, { "type": "loss", "content": 0.013391956686973572, "timestamp": "2025-09-30 22:06:22.896128", "step": 256, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-30 22:06:22.936522", "step": 256, "epoch": 1 }, { "type": "loss", "content": 0.017003720626235008, "timestamp": "2025-09-30 22:06:22.945242", "step": 257, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 256 ], "flops": 7593922352128 }, "timestamp": "2025-09-30 22:06:22.986133", "step": 257, "epoch": 1 }, { "type": "loss", "content": 0.00964332651346922, "timestamp": "2025-09-30 22:06:22.996079", "step": 258, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:06:23.032923", "step": 258, "epoch": 1 }, { "type": "loss", "content": 0.009957768023014069, "timestamp": "2025-09-30 22:06:23.042631", "step": 259, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-30 22:06:23.079157", "step": 259, "epoch": 1 }, { "type": "loss", "content": 0.017908958718180656, "timestamp": "2025-09-30 22:06:23.110604", "step": 260, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:06:23.146484", "step": 260, "epoch": 1 }, { "type": "loss", "content": 0.018779877573251724, "timestamp": "2025-09-30 22:06:23.157800", "step": 261, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:06:23.197483", "step": 261, "epoch": 1 }, { "type": "loss", "content": 0.013146973215043545, "timestamp": "2025-09-30 22:06:23.208174", "step": 262, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:06:23.243232", "step": 262, "epoch": 1 }, { "type": "loss", "content": 0.012784031219780445, "timestamp": "2025-09-30 22:06:23.254286", "step": 263, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:06:23.294930", "step": 263, "epoch": 1 }, { "type": "loss", "content": 0.026621146127581596, "timestamp": "2025-09-30 22:06:23.322677", "step": 264, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:06:23.355055", "step": 264, "epoch": 1 }, { "type": "loss", "content": 0.015577270649373531, "timestamp": "2025-09-30 22:06:23.363287", "step": 265, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:06:23.395307", "step": 265, "epoch": 1 }, { "type": "loss", "content": 0.023295968770980835, "timestamp": "2025-09-30 22:06:23.403246", "step": 266, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:06:23.440336", "step": 266, "epoch": 1 }, { "type": "loss", "content": 0.004181277472525835, "timestamp": "2025-09-30 22:06:23.450464", "step": 267, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:06:23.492255", "step": 267, "epoch": 1 }, { "type": "loss", "content": 0.02283737063407898, "timestamp": "2025-09-30 22:06:23.520986", "step": 268, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:06:23.559469", "step": 268, "epoch": 1 }, { "type": "loss", "content": 0.027866655960679054, "timestamp": "2025-09-30 22:06:23.565152", "step": 269, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:06:23.603471", "step": 269, "epoch": 1 }, { "type": "loss", "content": 0.017093496397137642, "timestamp": "2025-09-30 22:06:23.610790", "step": 270, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:06:23.651049", "step": 270, "epoch": 1 }, { "type": "loss", "content": 0.024166589602828026, "timestamp": "2025-09-30 22:06:23.659249", "step": 271, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-30 22:06:23.696804", "step": 271, "epoch": 1 }, { "type": "loss", "content": 0.026155179366469383, "timestamp": "2025-09-30 22:06:23.727858", "step": 272, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-30 22:06:23.767971", "step": 272, "epoch": 1 }, { "type": "loss", "content": 0.024930324405431747, "timestamp": "2025-09-30 22:06:23.773031", "step": 273, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 224 ], "batch_size": 8, "flops": 4429610391296 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 64 ], "batch_size": 8, "flops": 1265603017216 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 256 ], "batch_size": 8, "flops": 5062411866112 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 5, 144 ], "batch_size": 8, "flops": 2847606704256 } ], "timestamp": "2025-09-30 22:06:24.396835", "step": 273, "epoch": 1 }, { "type": "pplx", "content": 67738689.07516396, "timestamp": "2025-09-30 22:06:24.406333", "step": 273, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-30 22:06:24.441918", "step": 273, "epoch": 1 }, { "type": "loss", "content": 0.010441062971949577, "timestamp": "2025-09-30 22:06:24.463419", "step": 274, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-30 22:06:24.496375", "step": 274, "epoch": 1 }, { "type": "loss", "content": 0.023288477212190628, "timestamp": "2025-09-30 22:06:24.503290", "step": 275, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 272 ], "flops": 8068526078144 }, "timestamp": "2025-09-30 22:06:24.541145", "step": 275, "epoch": 1 }, { "type": "loss", "content": 0.01126602478325367, "timestamp": "2025-09-30 22:06:24.573099", "step": 276, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:06:24.614461", "step": 276, "epoch": 1 }, { "type": "loss", "content": 0.019958725199103355, "timestamp": "2025-09-30 22:06:24.617351", "step": 277, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-30 22:06:24.652885", "step": 277, "epoch": 1 }, { "type": "loss", "content": 0.013975156471133232, "timestamp": "2025-09-30 22:06:24.665078", "step": 278, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:06:24.709700", "step": 278, "epoch": 1 }, { "type": "loss", "content": 0.030132178217172623, "timestamp": "2025-09-30 22:06:24.718532", "step": 279, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:06:24.757773", "step": 279, "epoch": 1 }, { "type": "loss", "content": 0.012328540906310081, "timestamp": "2025-09-30 22:06:24.782673", "step": 280, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:06:24.825528", "step": 280, "epoch": 1 }, { "type": "loss", "content": 0.027462830767035484, "timestamp": "2025-09-30 22:06:24.835370", "step": 281, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:06:24.876083", "step": 281, "epoch": 1 }, { "type": "loss", "content": 0.01589822955429554, "timestamp": "2025-09-30 22:06:24.885679", "step": 282, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 256 ], "flops": 7593922352128 }, "timestamp": "2025-09-30 22:06:24.923440", "step": 282, "epoch": 1 }, { "type": "loss", "content": 0.030031440779566765, "timestamp": "2025-09-30 22:06:24.931219", "step": 283, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 304 ], "flops": 9017733530176 }, "timestamp": "2025-09-30 22:06:24.982462", "step": 283, "epoch": 1 }, { "type": "loss", "content": 0.025346266105771065, "timestamp": "2025-09-30 22:06:25.010712", "step": 284, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:06:25.050615", "step": 284, "epoch": 1 }, { "type": "loss", "content": 0.02812729962170124, "timestamp": "2025-09-30 22:06:25.060034", "step": 285, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-30 22:06:25.096299", "step": 285, "epoch": 1 }, { "type": "loss", "content": 0.0242477897554636, "timestamp": "2025-09-30 22:06:25.104222", "step": 286, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-30 22:06:25.142717", "step": 286, "epoch": 1 }, { "type": "loss", "content": 0.018631068989634514, "timestamp": "2025-09-30 22:06:25.149525", "step": 287, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:06:25.192273", "step": 287, "epoch": 1 }, { "type": "loss", "content": 0.009550183080136776, "timestamp": "2025-09-30 22:06:25.217564", "step": 288, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:06:25.249899", "step": 288, "epoch": 1 }, { "type": "loss", "content": 0.003718113526701927, "timestamp": "2025-09-30 22:06:25.259578", "step": 289, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:06:25.303569", "step": 289, "epoch": 1 }, { "type": "loss", "content": 0.020078999921679497, "timestamp": "2025-09-30 22:06:25.311292", "step": 290, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:06:25.350162", "step": 290, "epoch": 1 }, { "type": "loss", "content": 0.022243574261665344, "timestamp": "2025-09-30 22:06:25.359780", "step": 291, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:06:25.400489", "step": 291, "epoch": 1 }, { "type": "loss", "content": 0.014406189322471619, "timestamp": "2025-09-30 22:06:25.430318", "step": 292, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:06:25.468538", "step": 292, "epoch": 1 }, { "type": "loss", "content": 0.0135045712813735, "timestamp": "2025-09-30 22:06:25.471073", "step": 293, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:06:25.503612", "step": 293, "epoch": 1 }, { "type": "loss", "content": 0.009503084234893322, "timestamp": "2025-09-30 22:06:25.512131", "step": 294, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-30 22:06:25.548039", "step": 294, "epoch": 1 }, { "type": "loss", "content": 0.010631188750267029, "timestamp": "2025-09-30 22:06:25.555157", "step": 295, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:06:25.595185", "step": 295, "epoch": 1 }, { "type": "loss", "content": 0.025840530171990395, "timestamp": "2025-09-30 22:06:25.620041", "step": 296, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:06:25.652551", "step": 296, "epoch": 1 }, { "type": "loss", "content": 0.005571034271270037, "timestamp": "2025-09-30 22:06:25.664110", "step": 297, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:06:25.703265", "step": 297, "epoch": 1 }, { "type": "loss", "content": 0.03459536284208298, "timestamp": "2025-09-30 22:06:25.710008", "step": 298, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:06:25.746808", "step": 298, "epoch": 1 }, { "type": "loss", "content": 0.013033518567681313, "timestamp": "2025-09-30 22:06:25.755910", "step": 299, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-30 22:06:25.794091", "step": 299, "epoch": 1 }, { "type": "loss", "content": 0.025730684399604797, "timestamp": "2025-09-30 22:06:25.822479", "step": 300, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:06:25.856938", "step": 300, "epoch": 1 }, { "type": "loss", "content": 0.015336094424128532, "timestamp": "2025-09-30 22:06:25.868770", "step": 301, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:06:25.909828", "step": 301, "epoch": 1 }, { "type": "loss", "content": 0.047021035104990005, "timestamp": "2025-09-30 22:06:25.914186", "step": 302, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:06:25.952032", "step": 302, "epoch": 1 }, { "type": "loss", "content": 0.004503812175244093, "timestamp": "2025-09-30 22:06:25.956650", "step": 303, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:06:25.993051", "step": 303, "epoch": 1 }, { "type": "loss", "content": 0.005192614626139402, "timestamp": "2025-09-30 22:06:26.022878", "step": 304, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-30 22:06:26.053400", "step": 304, "epoch": 1 }, { "type": "loss", "content": 0.03500426560640335, "timestamp": "2025-09-30 22:06:26.061522", "step": 305, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-30 22:06:26.105787", "step": 305, "epoch": 1 }, { "type": "loss", "content": 0.011063890531659126, "timestamp": "2025-09-30 22:06:26.114338", "step": 306, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-30 22:06:26.152122", "step": 306, "epoch": 1 }, { "type": "loss", "content": 0.0441848523914814, "timestamp": "2025-09-30 22:06:26.159235", "step": 307, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-30 22:06:26.194940", "step": 307, "epoch": 1 }, { "type": "loss", "content": 0.013916602358222008, "timestamp": "2025-09-30 22:06:26.223115", "step": 308, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:06:26.255058", "step": 308, "epoch": 1 }, { "type": "loss", "content": 0.010168136097490788, "timestamp": "2025-09-30 22:06:26.260774", "step": 309, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-30 22:06:26.294159", "step": 309, "epoch": 1 }, { "type": "loss", "content": 0.004245879594236612, "timestamp": "2025-09-30 22:06:26.303522", "step": 310, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:06:26.338332", "step": 310, "epoch": 1 }, { "type": "loss", "content": 0.007543101906776428, "timestamp": "2025-09-30 22:06:26.345524", "step": 311, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:06:26.385370", "step": 311, "epoch": 1 }, { "type": "loss", "content": 0.0123042743653059, "timestamp": "2025-09-30 22:06:26.412954", "step": 312, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 224 ], "batch_size": 8, "flops": 4429610391296 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 64 ], "batch_size": 8, "flops": 1265603017216 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 256 ], "batch_size": 8, "flops": 5062411866112 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 5, 144 ], "batch_size": 8, "flops": 2847606704256 } ], "timestamp": "2025-09-30 22:06:27.028571", "step": 312, "epoch": 1 }, { "type": "pplx", "content": 80127076.99242038, "timestamp": "2025-09-30 22:06:27.043101", "step": 312, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:06:27.087289", "step": 312, "epoch": 1 }, { "type": "loss", "content": 0.025003399699926376, "timestamp": "2025-09-30 22:06:27.100523", "step": 313, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-30 22:06:27.142075", "step": 313, "epoch": 1 }, { "type": "loss", "content": 0.03449965640902519, "timestamp": "2025-09-30 22:06:27.150373", "step": 314, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:06:27.186771", "step": 314, "epoch": 1 }, { "type": "loss", "content": 0.015554225072264671, "timestamp": "2025-09-30 22:06:27.198131", "step": 315, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:06:27.235703", "step": 315, "epoch": 1 }, { "type": "loss", "content": 0.002846226328983903, "timestamp": "2025-09-30 22:06:27.267432", "step": 316, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:06:27.303449", "step": 316, "epoch": 1 }, { "type": "loss", "content": 0.0057856314815580845, "timestamp": "2025-09-30 22:06:27.315891", "step": 317, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:06:27.347591", "step": 317, "epoch": 1 }, { "type": "loss", "content": 0.006301513407379389, "timestamp": "2025-09-30 22:06:27.356087", "step": 318, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:06:27.392323", "step": 318, "epoch": 1 }, { "type": "loss", "content": 0.027794986963272095, "timestamp": "2025-09-30 22:06:27.395540", "step": 319, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:06:27.427051", "step": 319, "epoch": 1 }, { "type": "loss", "content": 0.03337537869811058, "timestamp": "2025-09-30 22:06:27.452922", "step": 320, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:06:27.489746", "step": 320, "epoch": 1 }, { "type": "loss", "content": 0.0039304569363594055, "timestamp": "2025-09-30 22:06:27.494950", "step": 321, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-30 22:06:27.531296", "step": 321, "epoch": 1 }, { "type": "loss", "content": 0.009028865024447441, "timestamp": "2025-09-30 22:06:27.539041", "step": 322, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:06:27.572205", "step": 322, "epoch": 1 }, { "type": "loss", "content": 0.004246709402650595, "timestamp": "2025-09-30 22:06:27.576856", "step": 323, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:06:27.609544", "step": 323, "epoch": 1 }, { "type": "loss", "content": 0.00866038165986538, "timestamp": "2025-09-30 22:06:27.634664", "step": 324, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:06:27.667286", "step": 324, "epoch": 1 }, { "type": "loss", "content": 0.0015846890164539218, "timestamp": "2025-09-30 22:06:27.675799", "step": 325, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:06:27.713112", "step": 325, "epoch": 1 }, { "type": "loss", "content": 0.03486005961894989, "timestamp": "2025-09-30 22:06:27.720983", "step": 326, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:06:27.756875", "step": 326, "epoch": 1 }, { "type": "loss", "content": 0.0034004233311861753, "timestamp": "2025-09-30 22:06:27.759550", "step": 327, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:06:27.791582", "step": 327, "epoch": 1 }, { "type": "loss", "content": 0.007430666591972113, "timestamp": "2025-09-30 22:06:27.817271", "step": 328, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-30 22:06:27.856464", "step": 328, "epoch": 1 }, { "type": "loss", "content": 0.014357471838593483, "timestamp": "2025-09-30 22:06:27.862786", "step": 329, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:06:27.894317", "step": 329, "epoch": 1 }, { "type": "loss", "content": 0.02626877650618553, "timestamp": "2025-09-30 22:06:27.905202", "step": 330, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:06:27.947148", "step": 330, "epoch": 1 }, { "type": "loss", "content": 0.016307147219777107, "timestamp": "2025-09-30 22:06:27.951155", "step": 331, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:06:27.986537", "step": 331, "epoch": 1 }, { "type": "loss", "content": 0.01717582531273365, "timestamp": "2025-09-30 22:06:28.017852", "step": 332, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-30 22:06:28.049610", "step": 332, "epoch": 1 }, { "type": "loss", "content": 0.0503578707575798, "timestamp": "2025-09-30 22:06:28.054474", "step": 333, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:06:28.092300", "step": 333, "epoch": 1 }, { "type": "loss", "content": 0.02990400418639183, "timestamp": "2025-09-30 22:06:28.096095", "step": 334, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:06:28.134615", "step": 334, "epoch": 1 }, { "type": "loss", "content": 0.010535559616982937, "timestamp": "2025-09-30 22:06:28.146207", "step": 335, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:06:28.184003", "step": 335, "epoch": 1 }, { "type": "loss", "content": 0.022340644150972366, "timestamp": "2025-09-30 22:06:28.212849", "step": 336, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:06:28.248764", "step": 336, "epoch": 1 }, { "type": "loss", "content": 0.0386832021176815, "timestamp": "2025-09-30 22:06:28.259287", "step": 337, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-30 22:06:28.296838", "step": 337, "epoch": 1 }, { "type": "loss", "content": 0.019745545461773872, "timestamp": "2025-09-30 22:06:28.309734", "step": 338, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:06:28.340946", "step": 338, "epoch": 1 }, { "type": "loss", "content": 0.04619959369301796, "timestamp": "2025-09-30 22:06:28.349895", "step": 339, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:06:28.388222", "step": 339, "epoch": 1 }, { "type": "loss", "content": 0.02208585850894451, "timestamp": "2025-09-30 22:06:28.412970", "step": 340, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-30 22:06:28.451940", "step": 340, "epoch": 1 }, { "type": "loss", "content": 0.027847709134221077, "timestamp": "2025-09-30 22:06:28.456897", "step": 341, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-30 22:06:28.488586", "step": 341, "epoch": 1 }, { "type": "loss", "content": 0.038464076817035675, "timestamp": "2025-09-30 22:06:28.495274", "step": 342, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:06:28.526226", "step": 342, "epoch": 1 }, { "type": "loss", "content": 0.006711141671985388, "timestamp": "2025-09-30 22:06:28.530589", "step": 343, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:06:28.568572", "step": 343, "epoch": 1 }, { "type": "loss", "content": 0.018281709402799606, "timestamp": "2025-09-30 22:06:28.597656", "step": 344, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-30 22:06:28.636348", "step": 344, "epoch": 1 }, { "type": "loss", "content": 0.006692342925816774, "timestamp": "2025-09-30 22:06:28.646848", "step": 345, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:06:28.680244", "step": 345, "epoch": 1 }, { "type": "loss", "content": 0.010647530667483807, "timestamp": "2025-09-30 22:06:28.684295", "step": 346, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:06:28.719171", "step": 346, "epoch": 1 }, { "type": "loss", "content": 0.009429781697690487, "timestamp": "2025-09-30 22:06:28.726414", "step": 347, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:06:28.762319", "step": 347, "epoch": 1 }, { "type": "loss", "content": 0.008969319984316826, "timestamp": "2025-09-30 22:06:28.787958", "step": 348, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:06:28.821492", "step": 348, "epoch": 1 }, { "type": "loss", "content": 0.025091560557484627, "timestamp": "2025-09-30 22:06:28.823676", "step": 349, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-30 22:06:28.855306", "step": 349, "epoch": 1 }, { "type": "loss", "content": 0.018482675775885582, "timestamp": "2025-09-30 22:06:28.862531", "step": 350, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:06:28.905175", "step": 350, "epoch": 1 }, { "type": "loss", "content": 0.013928530737757683, "timestamp": "2025-09-30 22:06:28.910346", "step": 351, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 224 ], "batch_size": 8, "flops": 4429610391296 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 64 ], "batch_size": 8, "flops": 1265603017216 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 256 ], "batch_size": 8, "flops": 5062411866112 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 5, 144 ], "batch_size": 8, "flops": 2847606704256 } ], "timestamp": "2025-09-30 22:06:29.567369", "step": 351, "epoch": 1 }, { "type": "pplx", "content": 82378214.0221123, "timestamp": "2025-09-30 22:06:29.573742", "step": 351, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-30 22:06:29.606386", "step": 351, "epoch": 1 }, { "type": "loss", "content": 0.023360390216112137, "timestamp": "2025-09-30 22:06:29.634118", "step": 352, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:06:29.668425", "step": 352, "epoch": 1 }, { "type": "loss", "content": 0.04514763504266739, "timestamp": "2025-09-30 22:06:29.672546", "step": 353, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:06:29.711007", "step": 353, "epoch": 1 }, { "type": "loss", "content": 0.019704774022102356, "timestamp": "2025-09-30 22:06:29.718836", "step": 354, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:06:29.758574", "step": 354, "epoch": 1 }, { "type": "loss", "content": 0.01630558632314205, "timestamp": "2025-09-30 22:06:29.769253", "step": 355, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-30 22:06:29.812347", "step": 355, "epoch": 1 }, { "type": "loss", "content": 0.021960977464914322, "timestamp": "2025-09-30 22:06:29.843635", "step": 356, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:06:29.884019", "step": 356, "epoch": 1 }, { "type": "loss", "content": 0.022443512454628944, "timestamp": "2025-09-30 22:06:29.888141", "step": 357, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-30 22:06:29.928405", "step": 357, "epoch": 1 }, { "type": "loss", "content": 0.023133354261517525, "timestamp": "2025-09-30 22:06:29.931560", "step": 358, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:06:29.967827", "step": 358, "epoch": 1 }, { "type": "loss", "content": 0.013051935471594334, "timestamp": "2025-09-30 22:06:29.971395", "step": 359, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 256 ], "flops": 7593922352128 }, "timestamp": "2025-09-30 22:06:30.014557", "step": 359, "epoch": 1 }, { "type": "loss", "content": 0.011638335883617401, "timestamp": "2025-09-30 22:06:30.043593", "step": 360, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:06:30.084777", "step": 360, "epoch": 1 }, { "type": "loss", "content": 0.01069671381264925, "timestamp": "2025-09-30 22:06:30.093341", "step": 361, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:06:30.133700", "step": 361, "epoch": 1 }, { "type": "loss", "content": 0.020872537046670914, "timestamp": "2025-09-30 22:06:30.138123", "step": 362, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:06:30.180226", "step": 362, "epoch": 1 }, { "type": "loss", "content": 0.01966075785458088, "timestamp": "2025-09-30 22:06:30.188724", "step": 363, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:06:30.227767", "step": 363, "epoch": 1 }, { "type": "loss", "content": 0.024045193567872047, "timestamp": "2025-09-30 22:06:30.256400", "step": 364, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:06:30.297037", "step": 364, "epoch": 1 }, { "type": "loss", "content": 0.011192423291504383, "timestamp": "2025-09-30 22:06:30.300657", "step": 365, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:06:30.341894", "step": 365, "epoch": 1 }, { "type": "loss", "content": 0.009517804719507694, "timestamp": "2025-09-30 22:06:30.344602", "step": 366, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:06:30.382839", "step": 366, "epoch": 1 }, { "type": "loss", "content": 0.011407798156142235, "timestamp": "2025-09-30 22:06:30.390653", "step": 367, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:06:30.424011", "step": 367, "epoch": 1 }, { "type": "loss", "content": 0.025280386209487915, "timestamp": "2025-09-30 22:06:30.455338", "step": 368, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:06:30.502843", "step": 368, "epoch": 1 }, { "type": "loss", "content": 0.011698196642100811, "timestamp": "2025-09-30 22:06:30.510007", "step": 369, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:06:30.562476", "step": 369, "epoch": 1 }, { "type": "loss", "content": 0.014684346504509449, "timestamp": "2025-09-30 22:06:30.567961", "step": 370, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:06:30.602348", "step": 370, "epoch": 1 }, { "type": "loss", "content": 0.022555148229002953, "timestamp": "2025-09-30 22:06:30.609767", "step": 371, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:06:30.647893", "step": 371, "epoch": 1 }, { "type": "loss", "content": 0.020193180069327354, "timestamp": "2025-09-30 22:06:30.678853", "step": 372, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:06:30.714691", "step": 372, "epoch": 1 }, { "type": "loss", "content": 0.014575188979506493, "timestamp": "2025-09-30 22:06:30.717267", "step": 373, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:06:30.755895", "step": 373, "epoch": 1 }, { "type": "loss", "content": 0.011546963825821877, "timestamp": "2025-09-30 22:06:30.766148", "step": 374, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:06:30.816703", "step": 374, "epoch": 1 }, { "type": "loss", "content": 0.018510419875383377, "timestamp": "2025-09-30 22:06:30.825663", "step": 375, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-30 22:06:30.859042", "step": 375, "epoch": 1 }, { "type": "loss", "content": 0.02678140625357628, "timestamp": "2025-09-30 22:06:30.888898", "step": 376, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:06:30.928626", "step": 376, "epoch": 1 }, { "type": "loss", "content": 0.014265132136642933, "timestamp": "2025-09-30 22:06:30.932308", "step": 377, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-30 22:06:30.970600", "step": 377, "epoch": 1 }, { "type": "loss", "content": 0.012481029145419598, "timestamp": "2025-09-30 22:06:30.978508", "step": 378, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:06:31.019119", "step": 378, "epoch": 1 }, { "type": "loss", "content": 0.025546716526150703, "timestamp": "2025-09-30 22:06:31.023547", "step": 379, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:06:31.057247", "step": 379, "epoch": 1 }, { "type": "loss", "content": 0.01196589320898056, "timestamp": "2025-09-30 22:06:31.082411", "step": 380, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:06:31.120796", "step": 380, "epoch": 1 }, { "type": "loss", "content": 0.011724757961928844, "timestamp": "2025-09-30 22:06:31.129210", "step": 381, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-30 22:06:31.166134", "step": 381, "epoch": 1 }, { "type": "loss", "content": 0.019267279654741287, "timestamp": "2025-09-30 22:06:31.173179", "step": 382, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-30 22:06:31.210994", "step": 382, "epoch": 1 }, { "type": "loss", "content": 0.024116748943924904, "timestamp": "2025-09-30 22:06:31.218079", "step": 383, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-30 22:06:31.249965", "step": 383, "epoch": 1 }, { "type": "loss", "content": 0.013756215572357178, "timestamp": "2025-09-30 22:06:31.277885", "step": 384, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:06:31.313944", "step": 384, "epoch": 1 }, { "type": "loss", "content": 0.011417214758694172, "timestamp": "2025-09-30 22:06:31.320095", "step": 385, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-30 22:06:31.354415", "step": 385, "epoch": 1 }, { "type": "loss", "content": 0.020715000107884407, "timestamp": "2025-09-30 22:06:31.362141", "step": 386, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:06:31.395711", "step": 386, "epoch": 1 }, { "type": "loss", "content": 0.007579857017844915, "timestamp": "2025-09-30 22:06:31.398574", "step": 387, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:06:31.429871", "step": 387, "epoch": 1 }, { "type": "loss", "content": 0.024890294298529625, "timestamp": "2025-09-30 22:06:31.453667", "step": 388, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:06:31.485946", "step": 388, "epoch": 1 }, { "type": "loss", "content": 0.013178860768675804, "timestamp": "2025-09-30 22:06:31.488895", "step": 389, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-30 22:06:31.520814", "step": 389, "epoch": 1 }, { "type": "loss", "content": 0.012439219281077385, "timestamp": "2025-09-30 22:06:31.527916", "step": 390, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 224 ], "batch_size": 8, "flops": 4429610391296 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 64 ], "batch_size": 8, "flops": 1265603017216 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 256 ], "batch_size": 8, "flops": 5062411866112 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 5, 144 ], "batch_size": 8, "flops": 2847606704256 } ], "timestamp": "2025-09-30 22:06:32.179927", "step": 390, "epoch": 1 }, { "type": "pplx", "content": 83420945.45658126, "timestamp": "2025-09-30 22:06:32.185006", "step": 390, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:06:32.227423", "step": 390, "epoch": 1 }, { "type": "loss", "content": 0.025691481307148933, "timestamp": "2025-09-30 22:06:32.243948", "step": 391, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:06:32.288555", "step": 391, "epoch": 1 }, { "type": "loss", "content": 0.021664824336767197, "timestamp": "2025-09-30 22:06:32.314261", "step": 392, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-30 22:06:32.351168", "step": 392, "epoch": 1 }, { "type": "loss", "content": 0.006012835539877415, "timestamp": "2025-09-30 22:06:32.365804", "step": 393, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:06:32.399460", "step": 393, "epoch": 1 }, { "type": "loss", "content": 0.014766584150493145, "timestamp": "2025-09-30 22:06:32.414902", "step": 394, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:06:32.465673", "step": 394, "epoch": 1 }, { "type": "loss", "content": 0.008303819224238396, "timestamp": "2025-09-30 22:06:32.470427", "step": 395, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 272 ], "flops": 8068526078144 }, "timestamp": "2025-09-30 22:06:32.514446", "step": 395, "epoch": 1 }, { "type": "loss", "content": 0.005097079090774059, "timestamp": "2025-09-30 22:06:32.551370", "step": 396, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:06:32.585372", "step": 396, "epoch": 1 }, { "type": "loss", "content": 0.012645237147808075, "timestamp": "2025-09-30 22:06:32.589263", "step": 397, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:06:32.624533", "step": 397, "epoch": 1 }, { "type": "loss", "content": 0.032030485570430756, "timestamp": "2025-09-30 22:06:32.641255", "step": 398, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-30 22:06:32.688822", "step": 398, "epoch": 1 }, { "type": "loss", "content": 0.029321294277906418, "timestamp": "2025-09-30 22:06:32.703140", "step": 399, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:06:32.737028", "step": 399, "epoch": 1 }, { "type": "loss", "content": 0.014075069688260555, "timestamp": "2025-09-30 22:06:32.762510", "step": 400, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:06:32.801365", "step": 400, "epoch": 1 }, { "type": "loss", "content": 0.014327221550047398, "timestamp": "2025-09-30 22:06:32.810199", "step": 401, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-30 22:06:32.846501", "step": 401, "epoch": 1 }, { "type": "loss", "content": 0.04210897162556648, "timestamp": "2025-09-30 22:06:32.854213", "step": 402, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-30 22:06:32.893696", "step": 402, "epoch": 1 }, { "type": "loss", "content": 0.015767447650432587, "timestamp": "2025-09-30 22:06:32.901172", "step": 403, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-30 22:06:32.942741", "step": 403, "epoch": 1 }, { "type": "loss", "content": 0.025074811652302742, "timestamp": "2025-09-30 22:06:32.967377", "step": 404, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 272 ], "flops": 8068526078144 }, "timestamp": "2025-09-30 22:06:33.000429", "step": 404, "epoch": 1 }, { "type": "loss", "content": 0.01254369132220745, "timestamp": "2025-09-30 22:06:33.008230", "step": 405, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:06:33.042356", "step": 405, "epoch": 1 }, { "type": "loss", "content": 0.0043205274268984795, "timestamp": "2025-09-30 22:06:33.047248", "step": 406, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-30 22:06:33.085128", "step": 406, "epoch": 1 }, { "type": "loss", "content": 0.019118309020996094, "timestamp": "2025-09-30 22:06:33.092620", "step": 407, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:06:33.126741", "step": 407, "epoch": 1 }, { "type": "loss", "content": 0.004540651571005583, "timestamp": "2025-09-30 22:06:33.152227", "step": 408, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:06:33.190413", "step": 408, "epoch": 1 }, { "type": "loss", "content": 0.012727133929729462, "timestamp": "2025-09-30 22:06:33.194032", "step": 409, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:06:33.230731", "step": 409, "epoch": 1 }, { "type": "loss", "content": 0.007929746992886066, "timestamp": "2025-09-30 22:06:33.232861", "step": 410, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:06:33.269768", "step": 410, "epoch": 1 }, { "type": "loss", "content": 0.01387192402034998, "timestamp": "2025-09-30 22:06:33.273785", "step": 411, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:06:33.307647", "step": 411, "epoch": 1 }, { "type": "loss", "content": 0.0033554253168404102, "timestamp": "2025-09-30 22:06:33.331754", "step": 412, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:06:33.369476", "step": 412, "epoch": 1 }, { "type": "loss", "content": 0.013880507089197636, "timestamp": "2025-09-30 22:06:33.373824", "step": 413, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:06:33.410753", "step": 413, "epoch": 1 }, { "type": "loss", "content": 0.009252405725419521, "timestamp": "2025-09-30 22:06:33.415270", "step": 414, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-30 22:06:33.454325", "step": 414, "epoch": 1 }, { "type": "loss", "content": 0.01770790107548237, "timestamp": "2025-09-30 22:06:33.461238", "step": 415, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:06:33.502321", "step": 415, "epoch": 1 }, { "type": "loss", "content": 0.019153054803609848, "timestamp": "2025-09-30 22:06:33.531090", "step": 416, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-30 22:06:33.572115", "step": 416, "epoch": 1 }, { "type": "loss", "content": 0.010953940451145172, "timestamp": "2025-09-30 22:06:33.576740", "step": 417, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:06:33.616997", "step": 417, "epoch": 1 }, { "type": "loss", "content": 0.014136207289993763, "timestamp": "2025-09-30 22:06:33.623330", "step": 418, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 336 ], "flops": 9966940982208 }, "timestamp": "2025-09-30 22:06:33.676826", "step": 418, "epoch": 1 }, { "type": "loss", "content": 0.011414690874516964, "timestamp": "2025-09-30 22:06:33.685449", "step": 419, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:06:33.723416", "step": 419, "epoch": 1 }, { "type": "loss", "content": 0.02028883434832096, "timestamp": "2025-09-30 22:06:33.753209", "step": 420, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-30 22:06:33.790617", "step": 420, "epoch": 1 }, { "type": "loss", "content": 0.00771242706105113, "timestamp": "2025-09-30 22:06:33.798423", "step": 421, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:06:33.833347", "step": 421, "epoch": 1 }, { "type": "loss", "content": 0.030433373525738716, "timestamp": "2025-09-30 22:06:33.843145", "step": 422, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-30 22:06:33.884278", "step": 422, "epoch": 1 }, { "type": "loss", "content": 0.004011732060462236, "timestamp": "2025-09-30 22:06:33.893197", "step": 423, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:06:33.930303", "step": 423, "epoch": 1 }, { "type": "loss", "content": 0.014322788454592228, "timestamp": "2025-09-30 22:06:33.954749", "step": 424, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:06:33.990666", "step": 424, "epoch": 1 }, { "type": "loss", "content": 0.024081062525510788, "timestamp": "2025-09-30 22:06:33.998713", "step": 425, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:06:34.032908", "step": 425, "epoch": 1 }, { "type": "loss", "content": 0.004053601063787937, "timestamp": "2025-09-30 22:06:34.039958", "step": 426, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:06:34.078596", "step": 426, "epoch": 1 }, { "type": "loss", "content": 0.00806428026407957, "timestamp": "2025-09-30 22:06:34.082886", "step": 427, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:06:34.119467", "step": 427, "epoch": 1 }, { "type": "loss", "content": 0.013106881640851498, "timestamp": "2025-09-30 22:06:34.143662", "step": 428, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:06:34.179606", "step": 428, "epoch": 1 }, { "type": "loss", "content": 0.020720576867461205, "timestamp": "2025-09-30 22:06:34.182418", "step": 429, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 224 ], "batch_size": 8, "flops": 4429610391296 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 64 ], "batch_size": 8, "flops": 1265603017216 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 256 ], "batch_size": 8, "flops": 5062411866112 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 5, 144 ], "batch_size": 8, "flops": 2847606704256 } ], "timestamp": "2025-09-30 22:06:34.827798", "step": 429, "epoch": 1 }, { "type": "pplx", "content": 90011934.36599956, "timestamp": "2025-09-30 22:06:34.837057", "step": 429, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:06:34.875069", "step": 429, "epoch": 1 }, { "type": "loss", "content": 0.006069289054721594, "timestamp": "2025-09-30 22:06:34.882360", "step": 430, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-30 22:06:34.919387", "step": 430, "epoch": 1 }, { "type": "loss", "content": 0.011751479469239712, "timestamp": "2025-09-30 22:06:34.927437", "step": 431, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:06:34.969882", "step": 431, "epoch": 1 }, { "type": "loss", "content": 0.018163446336984634, "timestamp": "2025-09-30 22:06:34.994177", "step": 432, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-30 22:06:35.036152", "step": 432, "epoch": 1 }, { "type": "loss", "content": 0.00829459261149168, "timestamp": "2025-09-30 22:06:35.046603", "step": 433, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:06:35.081522", "step": 433, "epoch": 1 }, { "type": "loss", "content": 0.021889938041567802, "timestamp": "2025-09-30 22:06:35.085410", "step": 434, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:06:35.127262", "step": 434, "epoch": 1 }, { "type": "loss", "content": 0.017087522894144058, "timestamp": "2025-09-30 22:06:35.133589", "step": 435, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 256 ], "flops": 7593922352128 }, "timestamp": "2025-09-30 22:06:35.169755", "step": 435, "epoch": 1 }, { "type": "loss", "content": 0.01923413760960102, "timestamp": "2025-09-30 22:06:35.198620", "step": 436, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:06:35.233508", "step": 436, "epoch": 1 }, { "type": "loss", "content": 0.0019212139304727316, "timestamp": "2025-09-30 22:06:35.237869", "step": 437, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-30 22:06:35.275977", "step": 437, "epoch": 1 }, { "type": "loss", "content": 0.012549689039587975, "timestamp": "2025-09-30 22:06:35.283593", "step": 438, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:06:35.323249", "step": 438, "epoch": 1 }, { "type": "loss", "content": 0.0119699751958251, "timestamp": "2025-09-30 22:06:35.327989", "step": 439, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:06:35.365424", "step": 439, "epoch": 1 }, { "type": "loss", "content": 0.030648184940218925, "timestamp": "2025-09-30 22:06:35.396401", "step": 440, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-30 22:06:35.435955", "step": 440, "epoch": 1 }, { "type": "loss", "content": 0.005409966688603163, "timestamp": "2025-09-30 22:06:35.440599", "step": 441, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:06:35.475640", "step": 441, "epoch": 1 }, { "type": "loss", "content": 0.03966454416513443, "timestamp": "2025-09-30 22:06:35.483423", "step": 442, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-30 22:06:35.521964", "step": 442, "epoch": 1 }, { "type": "loss", "content": 0.0014881438110023737, "timestamp": "2025-09-30 22:06:35.529594", "step": 443, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:06:35.567583", "step": 443, "epoch": 1 }, { "type": "loss", "content": 0.006455257534980774, "timestamp": "2025-09-30 22:06:35.597721", "step": 444, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:06:35.635404", "step": 444, "epoch": 1 }, { "type": "loss", "content": 0.0095746461302042, "timestamp": "2025-09-30 22:06:35.643666", "step": 445, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:06:35.678266", "step": 445, "epoch": 1 }, { "type": "loss", "content": 0.030881425365805626, "timestamp": "2025-09-30 22:06:35.681990", "step": 446, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:06:35.716077", "step": 446, "epoch": 1 }, { "type": "loss", "content": 0.014141733758151531, "timestamp": "2025-09-30 22:06:35.719478", "step": 447, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-30 22:06:35.751388", "step": 447, "epoch": 1 }, { "type": "loss", "content": 0.028649205341935158, "timestamp": "2025-09-30 22:06:35.778834", "step": 448, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:06:35.818945", "step": 448, "epoch": 1 }, { "type": "loss", "content": 0.010160253383219242, "timestamp": "2025-09-30 22:06:35.826430", "step": 449, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:06:35.866387", "step": 449, "epoch": 1 }, { "type": "loss", "content": 0.01143181324005127, "timestamp": "2025-09-30 22:06:35.873409", "step": 450, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:06:35.920147", "step": 450, "epoch": 1 }, { "type": "loss", "content": 0.0035703144967556, "timestamp": "2025-09-30 22:06:35.926330", "step": 451, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:06:35.968411", "step": 451, "epoch": 1 }, { "type": "loss", "content": 0.027362186461687088, "timestamp": "2025-09-30 22:06:35.998323", "step": 452, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:06:36.035135", "step": 452, "epoch": 1 }, { "type": "loss", "content": 0.003247453598305583, "timestamp": "2025-09-30 22:06:36.039382", "step": 453, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:06:36.073149", "step": 453, "epoch": 1 }, { "type": "loss", "content": 0.02386678196489811, "timestamp": "2025-09-30 22:06:36.078054", "step": 454, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:06:36.113306", "step": 454, "epoch": 1 }, { "type": "loss", "content": 0.016726380214095116, "timestamp": "2025-09-30 22:06:36.117638", "step": 455, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-30 22:06:36.152504", "step": 455, "epoch": 1 }, { "type": "loss", "content": 0.017773736268281937, "timestamp": "2025-09-30 22:06:36.180545", "step": 456, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 272 ], "flops": 8068526078144 }, "timestamp": "2025-09-30 22:06:36.215074", "step": 456, "epoch": 1 }, { "type": "loss", "content": 0.0035495886113494635, "timestamp": "2025-09-30 22:06:36.222851", "step": 457, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-30 22:06:36.255024", "step": 457, "epoch": 1 }, { "type": "loss", "content": 0.011258290149271488, "timestamp": "2025-09-30 22:06:36.262264", "step": 458, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:06:36.295548", "step": 458, "epoch": 1 }, { "type": "loss", "content": 0.02230708673596382, "timestamp": "2025-09-30 22:06:36.298473", "step": 459, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:06:36.331412", "step": 459, "epoch": 1 }, { "type": "loss", "content": 0.011926496401429176, "timestamp": "2025-09-30 22:06:36.355748", "step": 460, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:06:36.387974", "step": 460, "epoch": 1 }, { "type": "loss", "content": 0.028327742591500282, "timestamp": "2025-09-30 22:06:36.391390", "step": 461, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:06:36.422876", "step": 461, "epoch": 1 }, { "type": "loss", "content": 0.012645123526453972, "timestamp": "2025-09-30 22:06:36.425790", "step": 462, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:06:36.457933", "step": 462, "epoch": 1 }, { "type": "loss", "content": 0.016447247937321663, "timestamp": "2025-09-30 22:06:36.464285", "step": 463, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:06:36.496547", "step": 463, "epoch": 1 }, { "type": "loss", "content": 0.006226507015526295, "timestamp": "2025-09-30 22:06:36.521221", "step": 464, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:06:36.552674", "step": 464, "epoch": 1 }, { "type": "loss", "content": 0.011151151731610298, "timestamp": "2025-09-30 22:06:36.555628", "step": 465, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:06:36.587307", "step": 465, "epoch": 1 }, { "type": "loss", "content": 0.004149741493165493, "timestamp": "2025-09-30 22:06:36.590128", "step": 466, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:06:36.622240", "step": 466, "epoch": 1 }, { "type": "loss", "content": 0.0014940766850486398, "timestamp": "2025-09-30 22:06:36.625252", "step": 467, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:06:36.656873", "step": 467, "epoch": 1 }, { "type": "loss", "content": 0.014169608242809772, "timestamp": "2025-09-30 22:06:36.682050", "step": 468, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 224 ], "batch_size": 8, "flops": 4429610391296 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 64 ], "batch_size": 8, "flops": 1265603017216 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 256 ], "batch_size": 8, "flops": 5062411866112 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 5, 144 ], "batch_size": 8, "flops": 2847606704256 } ], "timestamp": "2025-09-30 22:06:37.284277", "step": 468, "epoch": 1 }, { "type": "pplx", "content": 95003170.83514951, "timestamp": "2025-09-30 22:06:37.286957", "step": 468, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:06:37.316111", "step": 468, "epoch": 1 }, { "type": "loss", "content": 0.023486703634262085, "timestamp": "2025-09-30 22:06:37.319361", "step": 469, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:06:37.350717", "step": 469, "epoch": 1 }, { "type": "loss", "content": 0.008476106449961662, "timestamp": "2025-09-30 22:06:37.353756", "step": 470, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:06:37.385655", "step": 470, "epoch": 1 }, { "type": "loss", "content": 0.0017679741140455008, "timestamp": "2025-09-30 22:06:37.390012", "step": 471, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:06:37.424819", "step": 471, "epoch": 1 }, { "type": "loss", "content": 0.008707636035978794, "timestamp": "2025-09-30 22:06:37.450531", "step": 472, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-30 22:06:37.482016", "step": 472, "epoch": 1 }, { "type": "loss", "content": 0.03412850573658943, "timestamp": "2025-09-30 22:06:37.486702", "step": 473, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:06:37.518160", "step": 473, "epoch": 1 }, { "type": "loss", "content": 0.025019986554980278, "timestamp": "2025-09-30 22:06:37.520467", "step": 474, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:06:37.551497", "step": 474, "epoch": 1 }, { "type": "loss", "content": 0.001524191233329475, "timestamp": "2025-09-30 22:06:37.555988", "step": 475, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-30 22:06:37.588908", "step": 475, "epoch": 1 }, { "type": "loss", "content": 0.027583520859479904, "timestamp": "2025-09-30 22:06:37.617003", "step": 476, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:06:37.647683", "step": 476, "epoch": 1 }, { "type": "loss", "content": 0.03453077748417854, "timestamp": "2025-09-30 22:06:37.649888", "step": 477, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:06:37.679918", "step": 477, "epoch": 1 }, { "type": "loss", "content": 0.024835318326950073, "timestamp": "2025-09-30 22:06:37.683358", "step": 478, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:06:37.715537", "step": 478, "epoch": 1 }, { "type": "loss", "content": 0.040151309221982956, "timestamp": "2025-09-30 22:06:37.719847", "step": 479, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:06:37.750700", "step": 479, "epoch": 1 }, { "type": "loss", "content": 0.0036200087051838636, "timestamp": "2025-09-30 22:06:37.774770", "step": 480, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:06:37.808959", "step": 480, "epoch": 1 }, { "type": "loss", "content": 0.006020395550876856, "timestamp": "2025-09-30 22:06:37.814228", "step": 481, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:06:37.848101", "step": 481, "epoch": 1 }, { "type": "loss", "content": 0.003234736854210496, "timestamp": "2025-09-30 22:06:37.853641", "step": 482, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:06:37.888226", "step": 482, "epoch": 1 }, { "type": "loss", "content": 0.03566857427358627, "timestamp": "2025-09-30 22:06:37.892668", "step": 483, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:06:37.925052", "step": 483, "epoch": 1 }, { "type": "loss", "content": 0.019549470394849777, "timestamp": "2025-09-30 22:06:37.949502", "step": 484, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 272 ], "flops": 8068526078144 }, "timestamp": "2025-09-30 22:06:37.981186", "step": 484, "epoch": 1 }, { "type": "loss", "content": 0.025290412828326225, "timestamp": "2025-09-30 22:06:37.988894", "step": 485, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:06:38.019618", "step": 485, "epoch": 1 }, { "type": "loss", "content": 0.012105834670364857, "timestamp": "2025-09-30 22:06:38.022315", "step": 486, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:06:38.052856", "step": 486, "epoch": 1 }, { "type": "loss", "content": 0.013042249716818333, "timestamp": "2025-09-30 22:06:38.055536", "step": 487, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:06:38.086616", "step": 487, "epoch": 1 }, { "type": "loss", "content": 0.01455900352448225, "timestamp": "2025-09-30 22:06:38.110658", "step": 488, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-30 22:06:38.140873", "step": 488, "epoch": 1 }, { "type": "loss", "content": 0.005409404635429382, "timestamp": "2025-09-30 22:06:38.143175", "step": 489, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:06:38.173671", "step": 489, "epoch": 1 }, { "type": "loss", "content": 0.009504382498562336, "timestamp": "2025-09-30 22:06:38.177664", "step": 490, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:06:38.208613", "step": 490, "epoch": 1 }, { "type": "loss", "content": 0.015173339284956455, "timestamp": "2025-09-30 22:06:38.211266", "step": 491, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:06:38.242847", "step": 491, "epoch": 1 }, { "type": "loss", "content": 0.007179237436503172, "timestamp": "2025-09-30 22:06:38.266869", "step": 492, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 256 ], "flops": 7593922352128 }, "timestamp": "2025-09-30 22:06:38.298667", "step": 492, "epoch": 1 }, { "type": "loss", "content": 0.005810712929815054, "timestamp": "2025-09-30 22:06:38.304119", "step": 493, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:06:38.335720", "step": 493, "epoch": 1 }, { "type": "loss", "content": 0.015788041055202484, "timestamp": "2025-09-30 22:06:38.339771", "step": 494, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:06:38.371143", "step": 494, "epoch": 1 }, { "type": "loss", "content": 0.011624081991612911, "timestamp": "2025-09-30 22:06:38.373470", "step": 495, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:06:38.405351", "step": 495, "epoch": 1 }, { "type": "loss", "content": 0.022120699286460876, "timestamp": "2025-09-30 22:06:38.430807", "step": 496, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:06:38.462152", "step": 496, "epoch": 1 }, { "type": "loss", "content": 0.0297573059797287, "timestamp": "2025-09-30 22:06:38.464812", "step": 497, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 256 ], "flops": 7593922352128 }, "timestamp": "2025-09-30 22:06:38.495593", "step": 497, "epoch": 1 }, { "type": "loss", "content": 0.01783009059727192, "timestamp": "2025-09-30 22:06:38.503357", "step": 498, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:06:38.535523", "step": 498, "epoch": 1 }, { "type": "loss", "content": 0.019278477877378464, "timestamp": "2025-09-30 22:06:38.538085", "step": 499, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-30 22:06:38.569936", "step": 499, "epoch": 1 }, { "type": "loss", "content": 0.022850925102829933, "timestamp": "2025-09-30 22:06:38.597572", "step": 500, "epoch": 1 }, { "type": "info", "content": "Checkpoint saved at step 500", "timestamp": "2025-09-30 22:06:44.217899", "step": 500, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:06:44.253369", "step": 500, "epoch": 1 }, { "type": "loss", "content": 0.041175130754709244, "timestamp": "2025-09-30 22:06:44.256246", "step": 501, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:06:44.292319", "step": 501, "epoch": 1 }, { "type": "loss", "content": 0.002944961655884981, "timestamp": "2025-09-30 22:06:44.295123", "step": 502, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 304 ], "flops": 9017733530176 }, "timestamp": "2025-09-30 22:06:44.328460", "step": 502, "epoch": 1 }, { "type": "loss", "content": 0.01657816581428051, "timestamp": "2025-09-30 22:06:44.340399", "step": 503, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:06:44.374189", "step": 503, "epoch": 1 }, { "type": "loss", "content": 0.022021254524588585, "timestamp": "2025-09-30 22:06:44.399146", "step": 504, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:06:44.432597", "step": 504, "epoch": 1 }, { "type": "loss", "content": 0.033169493079185486, "timestamp": "2025-09-30 22:06:44.434888", "step": 505, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:06:44.468761", "step": 505, "epoch": 1 }, { "type": "loss", "content": 0.029171505942940712, "timestamp": "2025-09-30 22:06:44.473321", "step": 506, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-30 22:06:44.506028", "step": 506, "epoch": 1 }, { "type": "loss", "content": 0.010595436207950115, "timestamp": "2025-09-30 22:06:44.513711", "step": 507, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 224 ], "batch_size": 8, "flops": 4429610391296 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 64 ], "batch_size": 8, "flops": 1265603017216 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 256 ], "batch_size": 8, "flops": 5062411866112 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 5, 144 ], "batch_size": 8, "flops": 2847606704256 } ], "timestamp": "2025-09-30 22:06:45.133572", "step": 507, "epoch": 1 }, { "type": "pplx", "content": 97756475.1005249, "timestamp": "2025-09-30 22:06:45.135637", "step": 507, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:06:45.165574", "step": 507, "epoch": 1 }, { "type": "loss", "content": 0.04366924986243248, "timestamp": "2025-09-30 22:06:45.189954", "step": 508, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:06:45.221574", "step": 508, "epoch": 1 }, { "type": "loss", "content": 0.008732793852686882, "timestamp": "2025-09-30 22:06:45.223449", "step": 509, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-30 22:06:45.254977", "step": 509, "epoch": 1 }, { "type": "loss", "content": 0.007448369171470404, "timestamp": "2025-09-30 22:06:45.261355", "step": 510, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:06:45.296559", "step": 510, "epoch": 1 }, { "type": "loss", "content": 0.016796164214611053, "timestamp": "2025-09-30 22:06:45.298505", "step": 511, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:06:45.330981", "step": 511, "epoch": 1 }, { "type": "loss", "content": 0.03324023261666298, "timestamp": "2025-09-30 22:06:45.355674", "step": 512, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 256 ], "flops": 7593922352128 }, "timestamp": "2025-09-30 22:06:45.389308", "step": 512, "epoch": 1 }, { "type": "loss", "content": 0.047144342213869095, "timestamp": "2025-09-30 22:06:45.393762", "step": 513, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:06:45.427924", "step": 513, "epoch": 1 }, { "type": "loss", "content": 0.007025440223515034, "timestamp": "2025-09-30 22:06:45.431507", "step": 514, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-30 22:06:45.463062", "step": 514, "epoch": 1 }, { "type": "loss", "content": 0.00455810921266675, "timestamp": "2025-09-30 22:06:45.469968", "step": 515, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 80 ], "flops": 2373281365952 }, "timestamp": "2025-09-30 22:06:45.515132", "step": 515, "epoch": 1 }, { "type": "loss", "content": 0.013665534555912018, "timestamp": "2025-09-30 22:06:45.539072", "step": 516, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-30 22:06:45.570113", "step": 516, "epoch": 1 }, { "type": "loss", "content": 0.025386126711964607, "timestamp": "2025-09-30 22:06:45.573121", "step": 517, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-30 22:06:45.603901", "step": 517, "epoch": 1 }, { "type": "loss", "content": 0.005903006065636873, "timestamp": "2025-09-30 22:06:45.611019", "step": 518, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:06:45.642060", "step": 518, "epoch": 1 }, { "type": "loss", "content": 0.016727006062865257, "timestamp": "2025-09-30 22:06:45.646298", "step": 519, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:06:45.677346", "step": 519, "epoch": 1 }, { "type": "loss", "content": 0.018626097589731216, "timestamp": "2025-09-30 22:06:45.700973", "step": 520, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:06:45.734530", "step": 520, "epoch": 1 }, { "type": "loss", "content": 0.01859509013593197, "timestamp": "2025-09-30 22:06:45.736463", "step": 521, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:06:45.767401", "step": 521, "epoch": 1 }, { "type": "loss", "content": 0.006714188493788242, "timestamp": "2025-09-30 22:06:45.769391", "step": 522, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:06:45.799905", "step": 522, "epoch": 1 }, { "type": "loss", "content": 0.03969809412956238, "timestamp": "2025-09-30 22:06:45.804326", "step": 523, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-30 22:06:45.834668", "step": 523, "epoch": 1 }, { "type": "loss", "content": 0.012870178557932377, "timestamp": "2025-09-30 22:06:45.862622", "step": 524, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:06:45.894090", "step": 524, "epoch": 1 }, { "type": "loss", "content": 0.01598569191992283, "timestamp": "2025-09-30 22:06:45.896567", "step": 525, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:06:45.929336", "step": 525, "epoch": 1 }, { "type": "loss", "content": 0.014287451282143593, "timestamp": "2025-09-30 22:06:45.933695", "step": 526, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-30 22:06:45.965407", "step": 526, "epoch": 1 }, { "type": "loss", "content": 0.023874444887042046, "timestamp": "2025-09-30 22:06:45.972665", "step": 527, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-30 22:06:46.008302", "step": 527, "epoch": 1 }, { "type": "loss", "content": 0.008932037279009819, "timestamp": "2025-09-30 22:06:46.038681", "step": 528, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:06:46.073979", "step": 528, "epoch": 1 }, { "type": "loss", "content": 0.02752871811389923, "timestamp": "2025-09-30 22:06:46.075778", "step": 529, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:06:46.106264", "step": 529, "epoch": 1 }, { "type": "loss", "content": 0.023428723216056824, "timestamp": "2025-09-30 22:06:46.108639", "step": 530, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:06:46.139631", "step": 530, "epoch": 1 }, { "type": "loss", "content": 0.00998811237514019, "timestamp": "2025-09-30 22:06:46.141840", "step": 531, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:06:46.172381", "step": 531, "epoch": 1 }, { "type": "loss", "content": 0.013783496804535389, "timestamp": "2025-09-30 22:06:46.196515", "step": 532, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:06:46.228967", "step": 532, "epoch": 1 }, { "type": "loss", "content": 0.023418204858899117, "timestamp": "2025-09-30 22:06:46.231291", "step": 533, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:06:46.262575", "step": 533, "epoch": 1 }, { "type": "loss", "content": 0.004461606964468956, "timestamp": "2025-09-30 22:06:46.266706", "step": 534, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:06:46.296963", "step": 534, "epoch": 1 }, { "type": "loss", "content": 0.009363941848278046, "timestamp": "2025-09-30 22:06:46.299277", "step": 535, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:06:46.330624", "step": 535, "epoch": 1 }, { "type": "loss", "content": 0.00438813166692853, "timestamp": "2025-09-30 22:06:46.355677", "step": 536, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:06:46.387169", "step": 536, "epoch": 1 }, { "type": "loss", "content": 0.014785653911530972, "timestamp": "2025-09-30 22:06:46.389247", "step": 537, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:06:46.420169", "step": 537, "epoch": 1 }, { "type": "loss", "content": 0.017323726788163185, "timestamp": "2025-09-30 22:06:46.424442", "step": 538, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-30 22:06:46.457355", "step": 538, "epoch": 1 }, { "type": "loss", "content": 0.009038321673870087, "timestamp": "2025-09-30 22:06:46.464317", "step": 539, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:06:46.495231", "step": 539, "epoch": 1 }, { "type": "loss", "content": 0.010455826297402382, "timestamp": "2025-09-30 22:06:46.520585", "step": 540, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-30 22:06:46.551367", "step": 540, "epoch": 1 }, { "type": "loss", "content": 0.028162604197859764, "timestamp": "2025-09-30 22:06:46.555749", "step": 541, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:06:46.586131", "step": 541, "epoch": 1 }, { "type": "loss", "content": 0.02013913355767727, "timestamp": "2025-09-30 22:06:46.588948", "step": 542, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:06:46.619608", "step": 542, "epoch": 1 }, { "type": "loss", "content": 0.01947060227394104, "timestamp": "2025-09-30 22:06:46.623840", "step": 543, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-30 22:06:46.655541", "step": 543, "epoch": 1 }, { "type": "loss", "content": 0.01323962863534689, "timestamp": "2025-09-30 22:06:46.678950", "step": 544, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:06:46.710010", "step": 544, "epoch": 1 }, { "type": "loss", "content": 0.01126060914248228, "timestamp": "2025-09-30 22:06:46.712091", "step": 545, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:06:46.742635", "step": 545, "epoch": 1 }, { "type": "loss", "content": 0.013927541673183441, "timestamp": "2025-09-30 22:06:46.747131", "step": 546, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 224 ], "batch_size": 8, "flops": 4429610391296 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 64 ], "batch_size": 8, "flops": 1265603017216 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 256 ], "batch_size": 8, "flops": 5062411866112 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 5, 144 ], "batch_size": 8, "flops": 2847606704256 } ], "timestamp": "2025-09-30 22:06:47.352604", "step": 546, "epoch": 1 }, { "type": "pplx", "content": 94871955.23744044, "timestamp": "2025-09-30 22:06:47.354628", "step": 546, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-30 22:06:47.383705", "step": 546, "epoch": 1 }, { "type": "loss", "content": 0.01951242797076702, "timestamp": "2025-09-30 22:06:47.390754", "step": 547, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:06:47.420889", "step": 547, "epoch": 1 }, { "type": "loss", "content": 0.015194809064269066, "timestamp": "2025-09-30 22:06:47.444712", "step": 548, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:06:47.475353", "step": 548, "epoch": 1 }, { "type": "loss", "content": 0.008021430112421513, "timestamp": "2025-09-30 22:06:47.477531", "step": 549, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:06:47.507989", "step": 549, "epoch": 1 }, { "type": "loss", "content": 0.0122072147205472, "timestamp": "2025-09-30 22:06:47.509933", "step": 550, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:06:47.539927", "step": 550, "epoch": 1 }, { "type": "loss", "content": 0.008672057650983334, "timestamp": "2025-09-30 22:06:47.550966", "step": 551, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-30 22:06:47.581627", "step": 551, "epoch": 1 }, { "type": "loss", "content": 0.012174529954791069, "timestamp": "2025-09-30 22:06:47.609948", "step": 552, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:06:47.644397", "step": 552, "epoch": 1 }, { "type": "loss", "content": 0.015683891251683235, "timestamp": "2025-09-30 22:06:47.646238", "step": 553, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:06:47.678675", "step": 553, "epoch": 1 }, { "type": "loss", "content": 0.013084937818348408, "timestamp": "2025-09-30 22:06:47.682799", "step": 554, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-30 22:06:47.718072", "step": 554, "epoch": 1 }, { "type": "loss", "content": 0.00986995454877615, "timestamp": "2025-09-30 22:06:47.719894", "step": 555, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:06:47.753102", "step": 555, "epoch": 1 }, { "type": "loss", "content": 0.018128668889403343, "timestamp": "2025-09-30 22:06:47.778525", "step": 556, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:06:47.811602", "step": 556, "epoch": 1 }, { "type": "loss", "content": 0.009639596566557884, "timestamp": "2025-09-30 22:06:47.813796", "step": 557, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:06:47.850136", "step": 557, "epoch": 1 }, { "type": "loss", "content": 0.007117317058146, "timestamp": "2025-09-30 22:06:47.854701", "step": 558, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:06:47.885597", "step": 558, "epoch": 1 }, { "type": "loss", "content": 0.011486091651022434, "timestamp": "2025-09-30 22:06:47.888372", "step": 559, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:06:47.921996", "step": 559, "epoch": 1 }, { "type": "loss", "content": 0.010793984867632389, "timestamp": "2025-09-30 22:06:47.947000", "step": 560, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-30 22:06:47.978081", "step": 560, "epoch": 1 }, { "type": "loss", "content": 0.020686279982328415, "timestamp": "2025-09-30 22:06:47.982606", "step": 561, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-30 22:06:48.013558", "step": 561, "epoch": 1 }, { "type": "loss", "content": 0.007002585101872683, "timestamp": "2025-09-30 22:06:48.020464", "step": 562, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:06:48.051491", "step": 562, "epoch": 1 }, { "type": "loss", "content": 0.020491817966103554, "timestamp": "2025-09-30 22:06:48.053816", "step": 563, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:06:48.085236", "step": 563, "epoch": 1 }, { "type": "loss", "content": 0.008508813567459583, "timestamp": "2025-09-30 22:06:48.110669", "step": 564, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-30 22:06:48.141790", "step": 564, "epoch": 1 }, { "type": "loss", "content": 0.011555018834769726, "timestamp": "2025-09-30 22:06:48.146402", "step": 565, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:06:48.178082", "step": 565, "epoch": 1 }, { "type": "loss", "content": 0.008152040652930737, "timestamp": "2025-09-30 22:06:48.180043", "step": 566, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:06:48.211126", "step": 566, "epoch": 1 }, { "type": "loss", "content": 0.008576711639761925, "timestamp": "2025-09-30 22:06:48.213275", "step": 567, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-30 22:06:48.244360", "step": 567, "epoch": 1 }, { "type": "loss", "content": 0.035935334861278534, "timestamp": "2025-09-30 22:06:48.272396", "step": 568, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-30 22:06:48.302643", "step": 568, "epoch": 1 }, { "type": "loss", "content": 0.0031208812724798918, "timestamp": "2025-09-30 22:06:48.304735", "step": 569, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:06:48.334856", "step": 569, "epoch": 1 }, { "type": "loss", "content": 0.01960751973092556, "timestamp": "2025-09-30 22:06:48.337107", "step": 570, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-30 22:06:48.367701", "step": 570, "epoch": 1 }, { "type": "loss", "content": 0.0027217951137572527, "timestamp": "2025-09-30 22:06:48.374781", "step": 571, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:06:48.407413", "step": 571, "epoch": 1 }, { "type": "loss", "content": 0.0036713494919240475, "timestamp": "2025-09-30 22:06:48.435047", "step": 572, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:06:48.468151", "step": 572, "epoch": 1 }, { "type": "loss", "content": 0.015845585614442825, "timestamp": "2025-09-30 22:06:48.470177", "step": 573, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-30 22:06:48.501027", "step": 573, "epoch": 1 }, { "type": "loss", "content": 0.0014490272151306272, "timestamp": "2025-09-30 22:06:48.508020", "step": 574, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-30 22:06:48.542698", "step": 574, "epoch": 1 }, { "type": "loss", "content": 0.01992383971810341, "timestamp": "2025-09-30 22:06:48.550367", "step": 575, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:06:48.581294", "step": 575, "epoch": 1 }, { "type": "loss", "content": 0.003482742002233863, "timestamp": "2025-09-30 22:06:48.605318", "step": 576, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:06:48.635302", "step": 576, "epoch": 1 }, { "type": "loss", "content": 0.0027481913566589355, "timestamp": "2025-09-30 22:06:48.637620", "step": 577, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:06:48.671067", "step": 577, "epoch": 1 }, { "type": "loss", "content": 0.005112276412546635, "timestamp": "2025-09-30 22:06:48.675234", "step": 578, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-30 22:06:48.706603", "step": 578, "epoch": 1 }, { "type": "loss", "content": 0.025796279311180115, "timestamp": "2025-09-30 22:06:48.713700", "step": 579, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:06:48.747278", "step": 579, "epoch": 1 }, { "type": "loss", "content": 0.007455120328813791, "timestamp": "2025-09-30 22:06:48.772475", "step": 580, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:06:48.803941", "step": 580, "epoch": 1 }, { "type": "loss", "content": 0.01217992790043354, "timestamp": "2025-09-30 22:06:48.806195", "step": 581, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-30 22:06:48.838109", "step": 581, "epoch": 1 }, { "type": "loss", "content": 0.008093575946986675, "timestamp": "2025-09-30 22:06:48.845630", "step": 582, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:06:48.877918", "step": 582, "epoch": 1 }, { "type": "loss", "content": 0.008411646820604801, "timestamp": "2025-09-30 22:06:48.882043", "step": 583, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:06:48.914372", "step": 583, "epoch": 1 }, { "type": "loss", "content": 0.012758438475430012, "timestamp": "2025-09-30 22:06:48.941051", "step": 584, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:06:48.974717", "step": 584, "epoch": 1 }, { "type": "loss", "content": 0.03811389580368996, "timestamp": "2025-09-30 22:06:48.978889", "step": 585, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 224 ], "batch_size": 8, "flops": 4429610391296 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 64 ], "batch_size": 8, "flops": 1265603017216 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 256 ], "batch_size": 8, "flops": 5062411866112 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 5, 144 ], "batch_size": 8, "flops": 2847606704256 } ], "timestamp": "2025-09-30 22:06:49.627435", "step": 585, "epoch": 1 }, { "type": "pplx", "content": 102180968.26728095, "timestamp": "2025-09-30 22:06:49.632578", "step": 585, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-30 22:06:49.664846", "step": 585, "epoch": 1 }, { "type": "loss", "content": 0.0035328001249581575, "timestamp": "2025-09-30 22:06:49.666935", "step": 586, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:06:49.700507", "step": 586, "epoch": 1 }, { "type": "loss", "content": 0.00914231687784195, "timestamp": "2025-09-30 22:06:49.705252", "step": 587, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:06:49.737368", "step": 587, "epoch": 1 }, { "type": "loss", "content": 0.04065397009253502, "timestamp": "2025-09-30 22:06:49.760855", "step": 588, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:06:49.792496", "step": 588, "epoch": 1 }, { "type": "loss", "content": 0.010214920155704021, "timestamp": "2025-09-30 22:06:49.794593", "step": 589, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:06:49.825056", "step": 589, "epoch": 1 }, { "type": "loss", "content": 0.0308766420930624, "timestamp": "2025-09-30 22:06:49.827892", "step": 590, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:06:49.858220", "step": 590, "epoch": 1 }, { "type": "loss", "content": 0.024278799071907997, "timestamp": "2025-09-30 22:06:49.862898", "step": 591, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:06:49.893886", "step": 591, "epoch": 1 }, { "type": "loss", "content": 0.016640285030007362, "timestamp": "2025-09-30 22:06:49.918485", "step": 592, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:06:49.950722", "step": 592, "epoch": 1 }, { "type": "loss", "content": 0.02836495451629162, "timestamp": "2025-09-30 22:06:49.952780", "step": 593, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-30 22:06:49.982820", "step": 593, "epoch": 1 }, { "type": "loss", "content": 0.01918208971619606, "timestamp": "2025-09-30 22:06:49.985339", "step": 594, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:06:50.017009", "step": 594, "epoch": 1 }, { "type": "loss", "content": 0.021310951560735703, "timestamp": "2025-09-30 22:06:50.019831", "step": 595, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:06:50.051700", "step": 595, "epoch": 1 }, { "type": "loss", "content": 0.017431020736694336, "timestamp": "2025-09-30 22:06:50.075339", "step": 596, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:06:50.105535", "step": 596, "epoch": 1 }, { "type": "loss", "content": 0.03336421772837639, "timestamp": "2025-09-30 22:06:50.107671", "step": 597, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-30 22:06:50.138886", "step": 597, "epoch": 1 }, { "type": "loss", "content": 0.005631003994494677, "timestamp": "2025-09-30 22:06:50.141007", "step": 598, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:06:50.171960", "step": 598, "epoch": 1 }, { "type": "loss", "content": 0.04496116191148758, "timestamp": "2025-09-30 22:06:50.173807", "step": 599, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 256 ], "flops": 7593922352128 }, "timestamp": "2025-09-30 22:06:50.204838", "step": 599, "epoch": 1 }, { "type": "loss", "content": 0.0018515412230044603, "timestamp": "2025-09-30 22:06:50.233766", "step": 600, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:06:50.265278", "step": 600, "epoch": 1 }, { "type": "loss", "content": 0.02544139325618744, "timestamp": "2025-09-30 22:06:50.267386", "step": 601, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-30 22:06:50.299119", "step": 601, "epoch": 1 }, { "type": "loss", "content": 0.015904691070318222, "timestamp": "2025-09-30 22:06:50.306245", "step": 602, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:06:50.337712", "step": 602, "epoch": 1 }, { "type": "loss", "content": 0.010545952245593071, "timestamp": "2025-09-30 22:06:50.340706", "step": 603, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:06:50.372483", "step": 603, "epoch": 1 }, { "type": "loss", "content": 0.0034892302937805653, "timestamp": "2025-09-30 22:06:50.397862", "step": 604, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:06:50.430005", "step": 604, "epoch": 1 }, { "type": "loss", "content": 0.025599991902709007, "timestamp": "2025-09-30 22:06:50.432156", "step": 605, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:06:50.463610", "step": 605, "epoch": 1 }, { "type": "loss", "content": 0.038173574954271317, "timestamp": "2025-09-30 22:06:50.465668", "step": 606, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:06:50.498393", "step": 606, "epoch": 1 }, { "type": "loss", "content": 0.04549602046608925, "timestamp": "2025-09-30 22:06:50.500506", "step": 607, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:06:50.531694", "step": 607, "epoch": 1 }, { "type": "loss", "content": 0.022459452971816063, "timestamp": "2025-09-30 22:06:50.555227", "step": 608, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:06:50.586267", "step": 608, "epoch": 1 }, { "type": "loss", "content": 0.032032039016485214, "timestamp": "2025-09-30 22:06:50.588709", "step": 609, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:06:50.621468", "step": 609, "epoch": 1 }, { "type": "loss", "content": 0.009997963905334473, "timestamp": "2025-09-30 22:06:50.623416", "step": 610, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:06:50.653701", "step": 610, "epoch": 1 }, { "type": "loss", "content": 0.015141839161515236, "timestamp": "2025-09-30 22:06:50.658433", "step": 611, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-30 22:06:50.690009", "step": 611, "epoch": 1 }, { "type": "loss", "content": 0.005697866436094046, "timestamp": "2025-09-30 22:06:50.718167", "step": 612, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-30 22:06:50.749478", "step": 612, "epoch": 1 }, { "type": "loss", "content": 0.014740964397788048, "timestamp": "2025-09-30 22:06:50.754150", "step": 613, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-30 22:06:50.785880", "step": 613, "epoch": 1 }, { "type": "loss", "content": 0.01579163782298565, "timestamp": "2025-09-30 22:06:50.787879", "step": 614, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-30 22:06:50.821175", "step": 614, "epoch": 1 }, { "type": "loss", "content": 0.01162130106240511, "timestamp": "2025-09-30 22:06:50.828468", "step": 615, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:06:50.859442", "step": 615, "epoch": 1 }, { "type": "loss", "content": 0.03986949473619461, "timestamp": "2025-09-30 22:06:50.883321", "step": 616, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-30 22:06:50.916008", "step": 616, "epoch": 1 }, { "type": "loss", "content": 0.023310085758566856, "timestamp": "2025-09-30 22:06:50.920911", "step": 617, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:06:50.953321", "step": 617, "epoch": 1 }, { "type": "loss", "content": 0.030140681192278862, "timestamp": "2025-09-30 22:06:50.955739", "step": 618, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:06:50.987428", "step": 618, "epoch": 1 }, { "type": "loss", "content": 0.019153164699673653, "timestamp": "2025-09-30 22:06:50.991908", "step": 619, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:06:51.027299", "step": 619, "epoch": 1 }, { "type": "loss", "content": 0.009747117757797241, "timestamp": "2025-09-30 22:06:51.054890", "step": 620, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:06:51.086707", "step": 620, "epoch": 1 }, { "type": "loss", "content": 0.021668700501322746, "timestamp": "2025-09-30 22:06:51.088771", "step": 621, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-30 22:06:51.122022", "step": 621, "epoch": 1 }, { "type": "loss", "content": 0.004393186420202255, "timestamp": "2025-09-30 22:06:51.129028", "step": 622, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 2, 192 ], "flops": 2847885110400 }, "timestamp": "2025-09-30 22:06:51.160010", "step": 622, "epoch": 1 }, { "type": "loss", "content": 0.027998492121696472, "timestamp": "2025-09-30 22:06:51.162315", "step": 623, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:06:51.211000", "step": 623, "epoch": 2 }, { "type": "loss", "content": 0.015790753066539764, "timestamp": "2025-09-30 22:06:51.234771", "step": 624, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 224 ], "batch_size": 8, "flops": 4429610391296 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 64 ], "batch_size": 8, "flops": 1265603017216 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 256 ], "batch_size": 8, "flops": 5062411866112 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 5, 144 ], "batch_size": 8, "flops": 2847606704256 } ], "timestamp": "2025-09-30 22:06:51.887027", "step": 624, "epoch": 2 }, { "type": "pplx", "content": 81566219.8820279, "timestamp": "2025-09-30 22:06:51.892545", "step": 624, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:06:51.925548", "step": 624, "epoch": 2 }, { "type": "loss", "content": 0.020654765889048576, "timestamp": "2025-09-30 22:06:51.930505", "step": 625, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:06:51.965820", "step": 625, "epoch": 2 }, { "type": "loss", "content": 0.011045219376683235, "timestamp": "2025-09-30 22:06:51.972662", "step": 626, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:06:52.006161", "step": 626, "epoch": 2 }, { "type": "loss", "content": 0.009478067047894001, "timestamp": "2025-09-30 22:06:52.017673", "step": 627, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-30 22:06:52.054213", "step": 627, "epoch": 2 }, { "type": "loss", "content": 0.004149326588958502, "timestamp": "2025-09-30 22:06:52.082814", "step": 628, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:06:52.115593", "step": 628, "epoch": 2 }, { "type": "loss", "content": 0.015558170154690742, "timestamp": "2025-09-30 22:06:52.120693", "step": 629, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-30 22:06:52.155400", "step": 629, "epoch": 2 }, { "type": "loss", "content": 0.012335834093391895, "timestamp": "2025-09-30 22:06:52.160827", "step": 630, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:06:52.194776", "step": 630, "epoch": 2 }, { "type": "loss", "content": 0.029784584417939186, "timestamp": "2025-09-30 22:06:52.199499", "step": 631, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-30 22:06:52.232499", "step": 631, "epoch": 2 }, { "type": "loss", "content": 0.015622490085661411, "timestamp": "2025-09-30 22:06:52.256461", "step": 632, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:06:52.291936", "step": 632, "epoch": 2 }, { "type": "loss", "content": 0.012185090221464634, "timestamp": "2025-09-30 22:06:52.295534", "step": 633, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-30 22:06:52.329014", "step": 633, "epoch": 2 }, { "type": "loss", "content": 0.040238264948129654, "timestamp": "2025-09-30 22:06:52.336786", "step": 634, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:06:52.369335", "step": 634, "epoch": 2 }, { "type": "loss", "content": 0.008953387849032879, "timestamp": "2025-09-30 22:06:52.373707", "step": 635, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:06:52.406464", "step": 635, "epoch": 2 }, { "type": "loss", "content": 0.008726871572434902, "timestamp": "2025-09-30 22:06:52.433313", "step": 636, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:06:52.471470", "step": 636, "epoch": 2 }, { "type": "loss", "content": 0.02807791158556938, "timestamp": "2025-09-30 22:06:52.477576", "step": 637, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:06:52.513510", "step": 637, "epoch": 2 }, { "type": "loss", "content": 0.021706465631723404, "timestamp": "2025-09-30 22:06:52.519779", "step": 638, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:06:52.555123", "step": 638, "epoch": 2 }, { "type": "loss", "content": 0.028942091390490532, "timestamp": "2025-09-30 22:06:52.562247", "step": 639, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:06:52.598303", "step": 639, "epoch": 2 }, { "type": "loss", "content": 0.021636173129081726, "timestamp": "2025-09-30 22:06:52.623774", "step": 640, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-30 22:06:52.658416", "step": 640, "epoch": 2 }, { "type": "loss", "content": 0.010957852937281132, "timestamp": "2025-09-30 22:06:52.663084", "step": 641, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-30 22:06:52.696076", "step": 641, "epoch": 2 }, { "type": "loss", "content": 0.015631377696990967, "timestamp": "2025-09-30 22:06:52.699458", "step": 642, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-30 22:06:52.733204", "step": 642, "epoch": 2 }, { "type": "loss", "content": 0.01614670641720295, "timestamp": "2025-09-30 22:06:52.740166", "step": 643, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-30 22:06:52.774097", "step": 643, "epoch": 2 }, { "type": "loss", "content": 0.020787807181477547, "timestamp": "2025-09-30 22:06:52.802283", "step": 644, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:06:52.833998", "step": 644, "epoch": 2 }, { "type": "loss", "content": 0.017705578356981277, "timestamp": "2025-09-30 22:06:52.836302", "step": 645, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:06:52.869466", "step": 645, "epoch": 2 }, { "type": "loss", "content": 0.022793063893914223, "timestamp": "2025-09-30 22:06:52.873043", "step": 646, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:06:52.906786", "step": 646, "epoch": 2 }, { "type": "loss", "content": 0.03414614126086235, "timestamp": "2025-09-30 22:06:52.910099", "step": 647, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-30 22:06:52.948576", "step": 647, "epoch": 2 }, { "type": "loss", "content": 0.006914376746863127, "timestamp": "2025-09-30 22:06:52.976574", "step": 648, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 256 ], "flops": 7593922352128 }, "timestamp": "2025-09-30 22:06:53.016760", "step": 648, "epoch": 2 }, { "type": "loss", "content": 0.00787010695785284, "timestamp": "2025-09-30 22:06:53.026794", "step": 649, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:06:53.058331", "step": 649, "epoch": 2 }, { "type": "loss", "content": 0.037422794848680496, "timestamp": "2025-09-30 22:06:53.064791", "step": 650, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:06:53.099702", "step": 650, "epoch": 2 }, { "type": "loss", "content": 0.024876395240426064, "timestamp": "2025-09-30 22:06:53.105407", "step": 651, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-30 22:06:53.140219", "step": 651, "epoch": 2 }, { "type": "loss", "content": 0.013162984512746334, "timestamp": "2025-09-30 22:06:53.168991", "step": 652, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:06:53.207410", "step": 652, "epoch": 2 }, { "type": "loss", "content": 0.00827017705887556, "timestamp": "2025-09-30 22:06:53.212882", "step": 653, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:06:53.246307", "step": 653, "epoch": 2 }, { "type": "loss", "content": 0.005606896709650755, "timestamp": "2025-09-30 22:06:53.252067", "step": 654, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:06:53.287654", "step": 654, "epoch": 2 }, { "type": "loss", "content": 0.020719783380627632, "timestamp": "2025-09-30 22:06:53.290458", "step": 655, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-30 22:06:53.326242", "step": 655, "epoch": 2 }, { "type": "loss", "content": 0.018828241154551506, "timestamp": "2025-09-30 22:06:53.354213", "step": 656, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-30 22:06:53.385968", "step": 656, "epoch": 2 }, { "type": "loss", "content": 0.021952269598841667, "timestamp": "2025-09-30 22:06:53.390774", "step": 657, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:06:53.423627", "step": 657, "epoch": 2 }, { "type": "loss", "content": 0.012793275527656078, "timestamp": "2025-09-30 22:06:53.429179", "step": 658, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:06:53.464103", "step": 658, "epoch": 2 }, { "type": "loss", "content": 0.01424416247755289, "timestamp": "2025-09-30 22:06:53.466970", "step": 659, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:06:53.500331", "step": 659, "epoch": 2 }, { "type": "loss", "content": 0.020341385155916214, "timestamp": "2025-09-30 22:06:53.525516", "step": 660, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:06:53.556820", "step": 660, "epoch": 2 }, { "type": "loss", "content": 0.01396105531603098, "timestamp": "2025-09-30 22:06:53.560714", "step": 661, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:06:53.593855", "step": 661, "epoch": 2 }, { "type": "loss", "content": 0.015063964761793613, "timestamp": "2025-09-30 22:06:53.597857", "step": 662, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:06:53.630838", "step": 662, "epoch": 2 }, { "type": "loss", "content": 0.019370300695300102, "timestamp": "2025-09-30 22:06:53.636632", "step": 663, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 224 ], "batch_size": 8, "flops": 4429610391296 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 64 ], "batch_size": 8, "flops": 1265603017216 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 256 ], "batch_size": 8, "flops": 5062411866112 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 5, 144 ], "batch_size": 8, "flops": 2847606704256 } ], "timestamp": "2025-09-30 22:06:54.267786", "step": 663, "epoch": 2 }, { "type": "pplx", "content": 77489775.34361932, "timestamp": "2025-09-30 22:06:54.273186", "step": 663, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-30 22:06:54.306633", "step": 663, "epoch": 2 }, { "type": "loss", "content": 0.014374351128935814, "timestamp": "2025-09-30 22:06:54.336794", "step": 664, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:06:54.368160", "step": 664, "epoch": 2 }, { "type": "loss", "content": 0.027066349983215332, "timestamp": "2025-09-30 22:06:54.371663", "step": 665, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:06:54.403813", "step": 665, "epoch": 2 }, { "type": "loss", "content": 0.013484174385666847, "timestamp": "2025-09-30 22:06:54.408358", "step": 666, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-30 22:06:54.440538", "step": 666, "epoch": 2 }, { "type": "loss", "content": 0.045839279890060425, "timestamp": "2025-09-30 22:06:54.447773", "step": 667, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:06:54.479727", "step": 667, "epoch": 2 }, { "type": "loss", "content": 0.0061507937498390675, "timestamp": "2025-09-30 22:06:54.504350", "step": 668, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:06:54.535714", "step": 668, "epoch": 2 }, { "type": "loss", "content": 0.01860460638999939, "timestamp": "2025-09-30 22:06:54.538570", "step": 669, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:06:54.570544", "step": 669, "epoch": 2 }, { "type": "loss", "content": 0.02531057968735695, "timestamp": "2025-09-30 22:06:54.575076", "step": 670, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-30 22:06:54.609118", "step": 670, "epoch": 2 }, { "type": "loss", "content": 0.02880782261490822, "timestamp": "2025-09-30 22:06:54.613357", "step": 671, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:06:54.649347", "step": 671, "epoch": 2 }, { "type": "loss", "content": 0.02219448611140251, "timestamp": "2025-09-30 22:06:54.673704", "step": 672, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:06:54.704753", "step": 672, "epoch": 2 }, { "type": "loss", "content": 0.012237013317644596, "timestamp": "2025-09-30 22:06:54.713283", "step": 673, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:06:54.748688", "step": 673, "epoch": 2 }, { "type": "loss", "content": 0.020764613524079323, "timestamp": "2025-09-30 22:06:54.750742", "step": 674, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:06:54.781620", "step": 674, "epoch": 2 }, { "type": "loss", "content": 0.003978596068918705, "timestamp": "2025-09-30 22:06:54.786220", "step": 675, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:06:54.818060", "step": 675, "epoch": 2 }, { "type": "loss", "content": 0.028562063351273537, "timestamp": "2025-09-30 22:06:54.842486", "step": 676, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:06:54.876042", "step": 676, "epoch": 2 }, { "type": "loss", "content": 0.009312150999903679, "timestamp": "2025-09-30 22:06:54.879318", "step": 677, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:06:54.911098", "step": 677, "epoch": 2 }, { "type": "loss", "content": 0.014050443656742573, "timestamp": "2025-09-30 22:06:54.916843", "step": 678, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:06:54.951884", "step": 678, "epoch": 2 }, { "type": "loss", "content": 0.020990783348679543, "timestamp": "2025-09-30 22:06:54.956016", "step": 679, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:06:54.988202", "step": 679, "epoch": 2 }, { "type": "loss", "content": 0.01680334284901619, "timestamp": "2025-09-30 22:06:55.012484", "step": 680, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:06:55.046218", "step": 680, "epoch": 2 }, { "type": "loss", "content": 0.027634665369987488, "timestamp": "2025-09-30 22:06:55.050194", "step": 681, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:06:55.084154", "step": 681, "epoch": 2 }, { "type": "loss", "content": 0.017819708213210106, "timestamp": "2025-09-30 22:06:55.086959", "step": 682, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:06:55.117628", "step": 682, "epoch": 2 }, { "type": "loss", "content": 0.022069228813052177, "timestamp": "2025-09-30 22:06:55.121770", "step": 683, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:06:55.153780", "step": 683, "epoch": 2 }, { "type": "loss", "content": 0.013544796034693718, "timestamp": "2025-09-30 22:06:55.178806", "step": 684, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:06:55.211027", "step": 684, "epoch": 2 }, { "type": "loss", "content": 0.01913309469819069, "timestamp": "2025-09-30 22:06:55.213480", "step": 685, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:06:55.245152", "step": 685, "epoch": 2 }, { "type": "loss", "content": 0.009119086898863316, "timestamp": "2025-09-30 22:06:55.249343", "step": 686, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:06:55.281575", "step": 686, "epoch": 2 }, { "type": "loss", "content": 0.016176508739590645, "timestamp": "2025-09-30 22:06:55.286110", "step": 687, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:06:55.317259", "step": 687, "epoch": 2 }, { "type": "loss", "content": 0.020089006051421165, "timestamp": "2025-09-30 22:06:55.342494", "step": 688, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:06:55.373780", "step": 688, "epoch": 2 }, { "type": "loss", "content": 0.019316306337714195, "timestamp": "2025-09-30 22:06:55.376470", "step": 689, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:06:55.410938", "step": 689, "epoch": 2 }, { "type": "loss", "content": 0.012639305554330349, "timestamp": "2025-09-30 22:06:55.416491", "step": 690, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-30 22:06:55.450366", "step": 690, "epoch": 2 }, { "type": "loss", "content": 0.010887724347412586, "timestamp": "2025-09-30 22:06:55.454657", "step": 691, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:06:55.486573", "step": 691, "epoch": 2 }, { "type": "loss", "content": 0.029533464461565018, "timestamp": "2025-09-30 22:06:55.511717", "step": 692, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:06:55.543655", "step": 692, "epoch": 2 }, { "type": "loss", "content": 0.01161243673413992, "timestamp": "2025-09-30 22:06:55.546033", "step": 693, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-30 22:06:55.578061", "step": 693, "epoch": 2 }, { "type": "loss", "content": 0.014599710702896118, "timestamp": "2025-09-30 22:06:55.580023", "step": 694, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-30 22:06:55.613907", "step": 694, "epoch": 2 }, { "type": "loss", "content": 0.013431690633296967, "timestamp": "2025-09-30 22:06:55.615914", "step": 695, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-30 22:06:55.647063", "step": 695, "epoch": 2 }, { "type": "loss", "content": 0.015383687801659107, "timestamp": "2025-09-30 22:06:55.674080", "step": 696, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:06:55.707054", "step": 696, "epoch": 2 }, { "type": "loss", "content": 0.008616768755018711, "timestamp": "2025-09-30 22:06:55.710421", "step": 697, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:06:55.742911", "step": 697, "epoch": 2 }, { "type": "loss", "content": 0.004892031196504831, "timestamp": "2025-09-30 22:06:55.745233", "step": 698, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:06:55.776553", "step": 698, "epoch": 2 }, { "type": "loss", "content": 0.004173830151557922, "timestamp": "2025-09-30 22:06:55.781131", "step": 699, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:06:55.812352", "step": 699, "epoch": 2 }, { "type": "loss", "content": 0.019902020692825317, "timestamp": "2025-09-30 22:06:55.836066", "step": 700, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-30 22:06:55.867295", "step": 700, "epoch": 2 }, { "type": "loss", "content": 0.014127095229923725, "timestamp": "2025-09-30 22:06:55.873163", "step": 701, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-30 22:06:55.909001", "step": 701, "epoch": 2 }, { "type": "loss", "content": 0.018558593466877937, "timestamp": "2025-09-30 22:06:55.916765", "step": 702, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 224 ], "batch_size": 8, "flops": 4429610391296 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 64 ], "batch_size": 8, "flops": 1265603017216 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 256 ], "batch_size": 8, "flops": 5062411866112 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 5, 144 ], "batch_size": 8, "flops": 2847606704256 } ], "timestamp": "2025-09-30 22:06:56.536558", "step": 702, "epoch": 2 }, { "type": "pplx", "content": 78927595.24360396, "timestamp": "2025-09-30 22:06:56.538322", "step": 702, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:06:56.567337", "step": 702, "epoch": 2 }, { "type": "loss", "content": 0.006854670587927103, "timestamp": "2025-09-30 22:06:56.571745", "step": 703, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-30 22:06:56.604593", "step": 703, "epoch": 2 }, { "type": "loss", "content": 0.010047112591564655, "timestamp": "2025-09-30 22:06:56.632147", "step": 704, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:06:56.663550", "step": 704, "epoch": 2 }, { "type": "loss", "content": 0.007333940826356411, "timestamp": "2025-09-30 22:06:56.665848", "step": 705, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-30 22:06:56.696003", "step": 705, "epoch": 2 }, { "type": "loss", "content": 0.01448789145797491, "timestamp": "2025-09-30 22:06:56.703808", "step": 706, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:06:56.735622", "step": 706, "epoch": 2 }, { "type": "loss", "content": 0.016120653599500656, "timestamp": "2025-09-30 22:06:56.739942", "step": 707, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-30 22:06:56.770880", "step": 707, "epoch": 2 }, { "type": "loss", "content": 0.0023472048342227936, "timestamp": "2025-09-30 22:06:56.799131", "step": 708, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:06:56.829559", "step": 708, "epoch": 2 }, { "type": "loss", "content": 0.005357666406780481, "timestamp": "2025-09-30 22:06:56.831489", "step": 709, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-30 22:06:56.862387", "step": 709, "epoch": 2 }, { "type": "loss", "content": 0.016620632261037827, "timestamp": "2025-09-30 22:06:56.870095", "step": 710, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:06:56.900537", "step": 710, "epoch": 2 }, { "type": "loss", "content": 0.004693766124546528, "timestamp": "2025-09-30 22:06:56.902712", "step": 711, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-30 22:06:56.933407", "step": 711, "epoch": 2 }, { "type": "loss", "content": 0.020208735018968582, "timestamp": "2025-09-30 22:06:56.961351", "step": 712, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:06:56.994316", "step": 712, "epoch": 2 }, { "type": "loss", "content": 0.00886120367795229, "timestamp": "2025-09-30 22:06:56.996243", "step": 713, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:06:57.054146", "step": 713, "epoch": 2 }, { "type": "loss", "content": 0.008350329473614693, "timestamp": "2025-09-30 22:06:57.057065", "step": 714, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:06:57.089829", "step": 714, "epoch": 2 }, { "type": "loss", "content": 0.009579906240105629, "timestamp": "2025-09-30 22:06:57.094350", "step": 715, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:06:57.125239", "step": 715, "epoch": 2 }, { "type": "loss", "content": 0.02509910985827446, "timestamp": "2025-09-30 22:06:57.150330", "step": 716, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:06:57.181409", "step": 716, "epoch": 2 }, { "type": "loss", "content": 0.01436839159578085, "timestamp": "2025-09-30 22:06:57.183643", "step": 717, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:06:57.213826", "step": 717, "epoch": 2 }, { "type": "loss", "content": 0.0069790855050086975, "timestamp": "2025-09-30 22:06:57.218317", "step": 718, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:06:57.249048", "step": 718, "epoch": 2 }, { "type": "loss", "content": 0.023231515660881996, "timestamp": "2025-09-30 22:06:57.253207", "step": 719, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:06:57.284159", "step": 719, "epoch": 2 }, { "type": "loss", "content": 0.013670231215655804, "timestamp": "2025-09-30 22:06:57.307905", "step": 720, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:06:57.338303", "step": 720, "epoch": 2 }, { "type": "loss", "content": 0.025221526622772217, "timestamp": "2025-09-30 22:06:57.340345", "step": 721, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-30 22:06:57.371395", "step": 721, "epoch": 2 }, { "type": "loss", "content": 0.003092572558671236, "timestamp": "2025-09-30 22:06:57.379091", "step": 722, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:06:57.409678", "step": 722, "epoch": 2 }, { "type": "loss", "content": 0.0018878942355513573, "timestamp": "2025-09-30 22:06:57.412535", "step": 723, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-30 22:06:57.444380", "step": 723, "epoch": 2 }, { "type": "loss", "content": 0.005184180103242397, "timestamp": "2025-09-30 22:06:57.472480", "step": 724, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:06:57.503000", "step": 724, "epoch": 2 }, { "type": "loss", "content": 0.004255128558725119, "timestamp": "2025-09-30 22:06:57.505485", "step": 725, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:06:57.536205", "step": 725, "epoch": 2 }, { "type": "loss", "content": 0.01653963141143322, "timestamp": "2025-09-30 22:06:57.538885", "step": 726, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:06:57.569237", "step": 726, "epoch": 2 }, { "type": "loss", "content": 0.012326141819357872, "timestamp": "2025-09-30 22:06:57.573907", "step": 727, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 256 ], "flops": 7593922352128 }, "timestamp": "2025-09-30 22:06:57.605730", "step": 727, "epoch": 2 }, { "type": "loss", "content": 0.00445201713591814, "timestamp": "2025-09-30 22:06:57.634845", "step": 728, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-30 22:06:57.668996", "step": 728, "epoch": 2 }, { "type": "loss", "content": 0.0036043657455593348, "timestamp": "2025-09-30 22:06:57.671120", "step": 729, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:06:57.701846", "step": 729, "epoch": 2 }, { "type": "loss", "content": 0.009894789196550846, "timestamp": "2025-09-30 22:06:57.706193", "step": 730, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:06:57.738040", "step": 730, "epoch": 2 }, { "type": "loss", "content": 0.00579670537263155, "timestamp": "2025-09-30 22:06:57.740011", "step": 731, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:06:57.771121", "step": 731, "epoch": 2 }, { "type": "loss", "content": 0.01987524703145027, "timestamp": "2025-09-30 22:06:57.794816", "step": 732, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:06:57.826959", "step": 732, "epoch": 2 }, { "type": "loss", "content": 0.01371761504560709, "timestamp": "2025-09-30 22:06:57.829307", "step": 733, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:06:57.860216", "step": 733, "epoch": 2 }, { "type": "loss", "content": 0.02026171050965786, "timestamp": "2025-09-30 22:06:57.864912", "step": 734, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:06:57.896655", "step": 734, "epoch": 2 }, { "type": "loss", "content": 0.009422077797353268, "timestamp": "2025-09-30 22:06:57.899037", "step": 735, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:06:57.931479", "step": 735, "epoch": 2 }, { "type": "loss", "content": 0.007785578723996878, "timestamp": "2025-09-30 22:06:57.956695", "step": 736, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-30 22:06:57.987650", "step": 736, "epoch": 2 }, { "type": "loss", "content": 0.01787823997437954, "timestamp": "2025-09-30 22:06:57.992954", "step": 737, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:06:58.022947", "step": 737, "epoch": 2 }, { "type": "loss", "content": 0.01570359617471695, "timestamp": "2025-09-30 22:06:58.026151", "step": 738, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:06:58.057153", "step": 738, "epoch": 2 }, { "type": "loss", "content": 0.005546352826058865, "timestamp": "2025-09-30 22:06:58.061668", "step": 739, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-30 22:06:58.093400", "step": 739, "epoch": 2 }, { "type": "loss", "content": 0.002934214426204562, "timestamp": "2025-09-30 22:06:58.118087", "step": 740, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-30 22:06:58.149705", "step": 740, "epoch": 2 }, { "type": "loss", "content": 0.017422713339328766, "timestamp": "2025-09-30 22:06:58.154336", "step": 741, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 224 ], "batch_size": 8, "flops": 4429610391296 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 64 ], "batch_size": 8, "flops": 1265603017216 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 256 ], "batch_size": 8, "flops": 5062411866112 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 5, 144 ], "batch_size": 8, "flops": 2847606704256 } ], "timestamp": "2025-09-30 22:06:58.767143", "step": 741, "epoch": 2 }, { "type": "pplx", "content": 89899745.22455357, "timestamp": "2025-09-30 22:06:58.769114", "step": 741, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:06:58.806015", "step": 741, "epoch": 2 }, { "type": "loss", "content": 0.009376317262649536, "timestamp": "2025-09-30 22:06:58.810769", "step": 742, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 288 ], "flops": 8543129804160 }, "timestamp": "2025-09-30 22:06:58.841216", "step": 742, "epoch": 2 }, { "type": "loss", "content": 0.010158979333937168, "timestamp": "2025-09-30 22:06:58.852371", "step": 743, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:06:58.883114", "step": 743, "epoch": 2 }, { "type": "loss", "content": 0.00397266773506999, "timestamp": "2025-09-30 22:06:58.908797", "step": 744, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:06:58.939482", "step": 744, "epoch": 2 }, { "type": "loss", "content": 0.04606813192367554, "timestamp": "2025-09-30 22:06:58.941486", "step": 745, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:06:58.972123", "step": 745, "epoch": 2 }, { "type": "loss", "content": 0.03550694137811661, "timestamp": "2025-09-30 22:06:58.976626", "step": 746, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-30 22:06:59.007952", "step": 746, "epoch": 2 }, { "type": "loss", "content": 0.015329292044043541, "timestamp": "2025-09-30 22:06:59.015089", "step": 747, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:06:59.046113", "step": 747, "epoch": 2 }, { "type": "loss", "content": 0.019602473825216293, "timestamp": "2025-09-30 22:06:59.070905", "step": 748, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:06:59.101710", "step": 748, "epoch": 2 }, { "type": "loss", "content": 0.0035281218588352203, "timestamp": "2025-09-30 22:06:59.106663", "step": 749, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:06:59.140814", "step": 749, "epoch": 2 }, { "type": "loss", "content": 0.0005557361873798072, "timestamp": "2025-09-30 22:06:59.145253", "step": 750, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:06:59.179556", "step": 750, "epoch": 2 }, { "type": "loss", "content": 0.0015190315898507833, "timestamp": "2025-09-30 22:06:59.183421", "step": 751, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:06:59.216433", "step": 751, "epoch": 2 }, { "type": "loss", "content": 0.0053230589255690575, "timestamp": "2025-09-30 22:06:59.243120", "step": 752, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 272 ], "flops": 8068526078144 }, "timestamp": "2025-09-30 22:06:59.275132", "step": 752, "epoch": 2 }, { "type": "loss", "content": 0.009202539920806885, "timestamp": "2025-09-30 22:06:59.282949", "step": 753, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:06:59.314704", "step": 753, "epoch": 2 }, { "type": "loss", "content": 0.0009166055242531002, "timestamp": "2025-09-30 22:06:59.318987", "step": 754, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-30 22:06:59.351436", "step": 754, "epoch": 2 }, { "type": "loss", "content": 0.0019864339847117662, "timestamp": "2025-09-30 22:06:59.354462", "step": 755, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:06:59.387145", "step": 755, "epoch": 2 }, { "type": "loss", "content": 0.0035643898881971836, "timestamp": "2025-09-30 22:06:59.412191", "step": 756, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-30 22:06:59.444629", "step": 756, "epoch": 2 }, { "type": "loss", "content": 0.006387303117662668, "timestamp": "2025-09-30 22:06:59.449290", "step": 757, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-30 22:06:59.481167", "step": 757, "epoch": 2 }, { "type": "loss", "content": 0.05764467641711235, "timestamp": "2025-09-30 22:06:59.488748", "step": 758, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:06:59.520634", "step": 758, "epoch": 2 }, { "type": "loss", "content": 0.0008298616739921272, "timestamp": "2025-09-30 22:06:59.525089", "step": 759, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-30 22:06:59.556392", "step": 759, "epoch": 2 }, { "type": "loss", "content": 0.01580825448036194, "timestamp": "2025-09-30 22:06:59.580149", "step": 760, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:06:59.610452", "step": 760, "epoch": 2 }, { "type": "loss", "content": 0.0028829979710280895, "timestamp": "2025-09-30 22:06:59.613808", "step": 761, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:06:59.649586", "step": 761, "epoch": 2 }, { "type": "loss", "content": 0.06186835095286369, "timestamp": "2025-09-30 22:06:59.654187", "step": 762, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:06:59.686470", "step": 762, "epoch": 2 }, { "type": "loss", "content": 0.0009510466479696333, "timestamp": "2025-09-30 22:06:59.688793", "step": 763, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:06:59.719817", "step": 763, "epoch": 2 }, { "type": "loss", "content": 0.001385956653393805, "timestamp": "2025-09-30 22:06:59.743490", "step": 764, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 256 ], "flops": 7593922352128 }, "timestamp": "2025-09-30 22:06:59.774084", "step": 764, "epoch": 2 }, { "type": "loss", "content": 0.021189337596297264, "timestamp": "2025-09-30 22:06:59.779826", "step": 765, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:06:59.810373", "step": 765, "epoch": 2 }, { "type": "loss", "content": 0.001855433569289744, "timestamp": "2025-09-30 22:06:59.812619", "step": 766, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:06:59.843261", "step": 766, "epoch": 2 }, { "type": "loss", "content": 0.0033974931575357914, "timestamp": "2025-09-30 22:06:59.845376", "step": 767, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:06:59.875644", "step": 767, "epoch": 2 }, { "type": "loss", "content": 0.024456506595015526, "timestamp": "2025-09-30 22:06:59.899138", "step": 768, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:06:59.929626", "step": 768, "epoch": 2 }, { "type": "loss", "content": 0.009055771864950657, "timestamp": "2025-09-30 22:06:59.931610", "step": 769, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:06:59.962127", "step": 769, "epoch": 2 }, { "type": "loss", "content": 0.005355574190616608, "timestamp": "2025-09-30 22:06:59.964077", "step": 770, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:06:59.994278", "step": 770, "epoch": 2 }, { "type": "loss", "content": 0.006609226576983929, "timestamp": "2025-09-30 22:06:59.998682", "step": 771, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:07:00.030577", "step": 771, "epoch": 2 }, { "type": "loss", "content": 0.014681574888527393, "timestamp": "2025-09-30 22:07:00.055805", "step": 772, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:07:00.087352", "step": 772, "epoch": 2 }, { "type": "loss", "content": 0.04299915209412575, "timestamp": "2025-09-30 22:07:00.089926", "step": 773, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:07:00.120826", "step": 773, "epoch": 2 }, { "type": "loss", "content": 0.0018289716681465507, "timestamp": "2025-09-30 22:07:00.123339", "step": 774, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:07:00.154141", "step": 774, "epoch": 2 }, { "type": "loss", "content": 0.029209045693278313, "timestamp": "2025-09-30 22:07:00.155948", "step": 775, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:07:00.186556", "step": 775, "epoch": 2 }, { "type": "loss", "content": 0.010646182112395763, "timestamp": "2025-09-30 22:07:00.210376", "step": 776, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:07:00.240818", "step": 776, "epoch": 2 }, { "type": "loss", "content": 0.037542395293712616, "timestamp": "2025-09-30 22:07:00.243006", "step": 777, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:07:00.273017", "step": 777, "epoch": 2 }, { "type": "loss", "content": 0.008672518655657768, "timestamp": "2025-09-30 22:07:00.277318", "step": 778, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-30 22:07:00.308323", "step": 778, "epoch": 2 }, { "type": "loss", "content": 0.012219659052789211, "timestamp": "2025-09-30 22:07:00.315325", "step": 779, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-30 22:07:00.345963", "step": 779, "epoch": 2 }, { "type": "loss", "content": 0.001221206970512867, "timestamp": "2025-09-30 22:07:00.369744", "step": 780, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 224 ], "batch_size": 8, "flops": 4429610391296 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 64 ], "batch_size": 8, "flops": 1265603017216 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 256 ], "batch_size": 8, "flops": 5062411866112 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 5, 144 ], "batch_size": 8, "flops": 2847606704256 } ], "timestamp": "2025-09-30 22:07:00.970836", "step": 780, "epoch": 2 }, { "type": "pplx", "content": 90265809.0697191, "timestamp": "2025-09-30 22:07:00.972470", "step": 780, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:07:01.000328", "step": 780, "epoch": 2 }, { "type": "loss", "content": 0.05539524182677269, "timestamp": "2025-09-30 22:07:01.003114", "step": 781, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:07:01.035736", "step": 781, "epoch": 2 }, { "type": "loss", "content": 0.03203495219349861, "timestamp": "2025-09-30 22:07:01.037895", "step": 782, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-30 22:07:01.074428", "step": 782, "epoch": 2 }, { "type": "loss", "content": 0.013802976347506046, "timestamp": "2025-09-30 22:07:01.081838", "step": 783, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:07:01.113051", "step": 783, "epoch": 2 }, { "type": "loss", "content": 0.017052913084626198, "timestamp": "2025-09-30 22:07:01.138288", "step": 784, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:07:01.169140", "step": 784, "epoch": 2 }, { "type": "loss", "content": 0.007087051402777433, "timestamp": "2025-09-30 22:07:01.171053", "step": 785, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-30 22:07:01.202539", "step": 785, "epoch": 2 }, { "type": "loss", "content": 0.022962896153330803, "timestamp": "2025-09-30 22:07:01.210296", "step": 786, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:07:01.240714", "step": 786, "epoch": 2 }, { "type": "loss", "content": 0.0020143541041761637, "timestamp": "2025-09-30 22:07:01.245325", "step": 787, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:07:01.275591", "step": 787, "epoch": 2 }, { "type": "loss", "content": 0.004092036280781031, "timestamp": "2025-09-30 22:07:01.299424", "step": 788, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:07:01.329115", "step": 788, "epoch": 2 }, { "type": "loss", "content": 0.014684909954667091, "timestamp": "2025-09-30 22:07:01.331208", "step": 789, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:07:01.365841", "step": 789, "epoch": 2 }, { "type": "loss", "content": 0.003241106402128935, "timestamp": "2025-09-30 22:07:01.370238", "step": 790, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:07:01.400945", "step": 790, "epoch": 2 }, { "type": "loss", "content": 0.015645338222384453, "timestamp": "2025-09-30 22:07:01.403706", "step": 791, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-30 22:07:01.434597", "step": 791, "epoch": 2 }, { "type": "loss", "content": 0.0030750453006476164, "timestamp": "2025-09-30 22:07:01.463080", "step": 792, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:07:01.496741", "step": 792, "epoch": 2 }, { "type": "loss", "content": 0.0076875039376318455, "timestamp": "2025-09-30 22:07:01.499595", "step": 793, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:07:01.530808", "step": 793, "epoch": 2 }, { "type": "loss", "content": 0.02010907046496868, "timestamp": "2025-09-30 22:07:01.533212", "step": 794, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-30 22:07:01.565119", "step": 794, "epoch": 2 }, { "type": "loss", "content": 0.014395522885024548, "timestamp": "2025-09-30 22:07:01.572445", "step": 795, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:07:01.602870", "step": 795, "epoch": 2 }, { "type": "loss", "content": 0.019928177818655968, "timestamp": "2025-09-30 22:07:01.628077", "step": 796, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:07:01.660832", "step": 796, "epoch": 2 }, { "type": "loss", "content": 0.036851465702056885, "timestamp": "2025-09-30 22:07:01.662989", "step": 797, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:07:01.700539", "step": 797, "epoch": 2 }, { "type": "loss", "content": 0.010141278617084026, "timestamp": "2025-09-30 22:07:01.705207", "step": 798, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:07:01.737319", "step": 798, "epoch": 2 }, { "type": "loss", "content": 0.0025164049584418535, "timestamp": "2025-09-30 22:07:01.740106", "step": 799, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-30 22:07:01.771272", "step": 799, "epoch": 2 }, { "type": "loss", "content": 0.019155463203787804, "timestamp": "2025-09-30 22:07:01.794744", "step": 800, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:07:01.826250", "step": 800, "epoch": 2 }, { "type": "loss", "content": 0.001676939777098596, "timestamp": "2025-09-30 22:07:01.829063", "step": 801, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:07:01.862176", "step": 801, "epoch": 2 }, { "type": "loss", "content": 0.0038142234552651644, "timestamp": "2025-09-30 22:07:01.866392", "step": 802, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:07:01.898311", "step": 802, "epoch": 2 }, { "type": "loss", "content": 0.008628753945231438, "timestamp": "2025-09-30 22:07:01.901063", "step": 803, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:07:01.934177", "step": 803, "epoch": 2 }, { "type": "loss", "content": 0.022448832169175148, "timestamp": "2025-09-30 22:07:01.959464", "step": 804, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-30 22:07:01.994813", "step": 804, "epoch": 2 }, { "type": "loss", "content": 0.03241322934627533, "timestamp": "2025-09-30 22:07:02.000017", "step": 805, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:07:02.033172", "step": 805, "epoch": 2 }, { "type": "loss", "content": 0.005572688765823841, "timestamp": "2025-09-30 22:07:02.035635", "step": 806, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:07:02.066587", "step": 806, "epoch": 2 }, { "type": "loss", "content": 0.0072050197049975395, "timestamp": "2025-09-30 22:07:02.070044", "step": 807, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:07:02.103462", "step": 807, "epoch": 2 }, { "type": "loss", "content": 0.009295523166656494, "timestamp": "2025-09-30 22:07:02.128733", "step": 808, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:07:02.161898", "step": 808, "epoch": 2 }, { "type": "loss", "content": 0.014571547508239746, "timestamp": "2025-09-30 22:07:02.163856", "step": 809, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:07:02.197047", "step": 809, "epoch": 2 }, { "type": "loss", "content": 0.015193559229373932, "timestamp": "2025-09-30 22:07:02.200539", "step": 810, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-30 22:07:02.235139", "step": 810, "epoch": 2 }, { "type": "loss", "content": 0.005540668033063412, "timestamp": "2025-09-30 22:07:02.242422", "step": 811, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:07:02.274900", "step": 811, "epoch": 2 }, { "type": "loss", "content": 0.008927380666136742, "timestamp": "2025-09-30 22:07:02.300147", "step": 812, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:07:02.334365", "step": 812, "epoch": 2 }, { "type": "loss", "content": 0.006226464174687862, "timestamp": "2025-09-30 22:07:02.337229", "step": 813, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:07:02.368031", "step": 813, "epoch": 2 }, { "type": "loss", "content": 0.02287706360220909, "timestamp": "2025-09-30 22:07:02.370472", "step": 814, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:07:02.402790", "step": 814, "epoch": 2 }, { "type": "loss", "content": 0.02037571184337139, "timestamp": "2025-09-30 22:07:02.405310", "step": 815, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:07:02.437854", "step": 815, "epoch": 2 }, { "type": "loss", "content": 0.005328350700438023, "timestamp": "2025-09-30 22:07:02.469644", "step": 816, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:07:02.501613", "step": 816, "epoch": 2 }, { "type": "loss", "content": 0.019001901149749756, "timestamp": "2025-09-30 22:07:02.505118", "step": 817, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:07:02.538290", "step": 817, "epoch": 2 }, { "type": "loss", "content": 0.0028542112559080124, "timestamp": "2025-09-30 22:07:02.540778", "step": 818, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:07:02.572538", "step": 818, "epoch": 2 }, { "type": "loss", "content": 0.015463702380657196, "timestamp": "2025-09-30 22:07:02.577204", "step": 819, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 224 ], "batch_size": 8, "flops": 4429610391296 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 64 ], "batch_size": 8, "flops": 1265603017216 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 256 ], "batch_size": 8, "flops": 5062411866112 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 5, 144 ], "batch_size": 8, "flops": 2847606704256 } ], "timestamp": "2025-09-30 22:07:03.192026", "step": 819, "epoch": 2 }, { "type": "pplx", "content": 84273412.49839544, "timestamp": "2025-09-30 22:07:03.195515", "step": 819, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:07:03.226123", "step": 819, "epoch": 2 }, { "type": "loss", "content": 0.010286093689501286, "timestamp": "2025-09-30 22:07:03.251787", "step": 820, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:07:03.286105", "step": 820, "epoch": 2 }, { "type": "loss", "content": 0.040026549249887466, "timestamp": "2025-09-30 22:07:03.288272", "step": 821, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-30 22:07:03.320426", "step": 821, "epoch": 2 }, { "type": "loss", "content": 0.0017085708677768707, "timestamp": "2025-09-30 22:07:03.327568", "step": 822, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-30 22:07:03.360075", "step": 822, "epoch": 2 }, { "type": "loss", "content": 0.002073936630040407, "timestamp": "2025-09-30 22:07:03.367245", "step": 823, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-30 22:07:03.403097", "step": 823, "epoch": 2 }, { "type": "loss", "content": 0.012383386492729187, "timestamp": "2025-09-30 22:07:03.431148", "step": 824, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:07:03.465505", "step": 824, "epoch": 2 }, { "type": "loss", "content": 0.005613238550722599, "timestamp": "2025-09-30 22:07:03.467662", "step": 825, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:07:03.499264", "step": 825, "epoch": 2 }, { "type": "loss", "content": 0.010608835145831108, "timestamp": "2025-09-30 22:07:03.501447", "step": 826, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:07:03.533710", "step": 826, "epoch": 2 }, { "type": "loss", "content": 0.016609007492661476, "timestamp": "2025-09-30 22:07:03.536995", "step": 827, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:07:03.570057", "step": 827, "epoch": 2 }, { "type": "loss", "content": 0.01166166365146637, "timestamp": "2025-09-30 22:07:03.594362", "step": 828, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:07:03.626169", "step": 828, "epoch": 2 }, { "type": "loss", "content": 0.013758150860667229, "timestamp": "2025-09-30 22:07:03.631126", "step": 829, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-30 22:07:03.662702", "step": 829, "epoch": 2 }, { "type": "loss", "content": 0.03357986733317375, "timestamp": "2025-09-30 22:07:03.665896", "step": 830, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-30 22:07:03.697811", "step": 830, "epoch": 2 }, { "type": "loss", "content": 0.019499024376273155, "timestamp": "2025-09-30 22:07:03.705171", "step": 831, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:07:03.740713", "step": 831, "epoch": 2 }, { "type": "loss", "content": 0.01842236891388893, "timestamp": "2025-09-30 22:07:03.764721", "step": 832, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:07:03.797386", "step": 832, "epoch": 2 }, { "type": "loss", "content": 0.018398495391011238, "timestamp": "2025-09-30 22:07:03.800051", "step": 833, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-30 22:07:03.831348", "step": 833, "epoch": 2 }, { "type": "loss", "content": 0.019753288477659225, "timestamp": "2025-09-30 22:07:03.838535", "step": 834, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:07:03.873976", "step": 834, "epoch": 2 }, { "type": "loss", "content": 0.0059713092632591724, "timestamp": "2025-09-30 22:07:03.876889", "step": 835, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-30 22:07:03.908398", "step": 835, "epoch": 2 }, { "type": "loss", "content": 0.008703912608325481, "timestamp": "2025-09-30 22:07:03.931557", "step": 836, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:07:03.963534", "step": 836, "epoch": 2 }, { "type": "loss", "content": 0.022324806079268456, "timestamp": "2025-09-30 22:07:03.966052", "step": 837, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:07:03.996077", "step": 837, "epoch": 2 }, { "type": "loss", "content": 0.01000311691313982, "timestamp": "2025-09-30 22:07:03.998268", "step": 838, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:07:04.031057", "step": 838, "epoch": 2 }, { "type": "loss", "content": 0.0029391685966402292, "timestamp": "2025-09-30 22:07:04.033715", "step": 839, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-30 22:07:04.066955", "step": 839, "epoch": 2 }, { "type": "loss", "content": 0.005705764051526785, "timestamp": "2025-09-30 22:07:04.090396", "step": 840, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 256 ], "flops": 7593922352128 }, "timestamp": "2025-09-30 22:07:04.122879", "step": 840, "epoch": 2 }, { "type": "loss", "content": 0.003614378860220313, "timestamp": "2025-09-30 22:07:04.128426", "step": 841, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:07:04.163958", "step": 841, "epoch": 2 }, { "type": "loss", "content": 0.0036180841270834208, "timestamp": "2025-09-30 22:07:04.168311", "step": 842, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 256 ], "flops": 7593922352128 }, "timestamp": "2025-09-30 22:07:04.203861", "step": 842, "epoch": 2 }, { "type": "loss", "content": 0.015567810274660587, "timestamp": "2025-09-30 22:07:04.211802", "step": 843, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:07:04.242892", "step": 843, "epoch": 2 }, { "type": "loss", "content": 0.018757859244942665, "timestamp": "2025-09-30 22:07:04.268474", "step": 844, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 272 ], "flops": 8068526078144 }, "timestamp": "2025-09-30 22:07:04.299760", "step": 844, "epoch": 2 }, { "type": "loss", "content": 0.012565111741423607, "timestamp": "2025-09-30 22:07:04.307580", "step": 845, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-30 22:07:04.338572", "step": 845, "epoch": 2 }, { "type": "loss", "content": 0.010797788389027119, "timestamp": "2025-09-30 22:07:04.345768", "step": 846, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-30 22:07:04.378499", "step": 846, "epoch": 2 }, { "type": "loss", "content": 0.015541709028184414, "timestamp": "2025-09-30 22:07:04.385672", "step": 847, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:07:04.416407", "step": 847, "epoch": 2 }, { "type": "loss", "content": 0.03071592189371586, "timestamp": "2025-09-30 22:07:04.440055", "step": 848, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:07:04.471043", "step": 848, "epoch": 2 }, { "type": "loss", "content": 0.00239894213154912, "timestamp": "2025-09-30 22:07:04.472802", "step": 849, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:07:04.512397", "step": 849, "epoch": 2 }, { "type": "loss", "content": 0.029214853420853615, "timestamp": "2025-09-30 22:07:04.516927", "step": 850, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:07:04.555011", "step": 850, "epoch": 2 }, { "type": "loss", "content": 0.017962682992219925, "timestamp": "2025-09-30 22:07:04.556939", "step": 851, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-30 22:07:04.587609", "step": 851, "epoch": 2 }, { "type": "loss", "content": 0.002266437280923128, "timestamp": "2025-09-30 22:07:04.615918", "step": 852, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:07:04.647708", "step": 852, "epoch": 2 }, { "type": "loss", "content": 0.011912612244486809, "timestamp": "2025-09-30 22:07:04.649717", "step": 853, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:07:04.680773", "step": 853, "epoch": 2 }, { "type": "loss", "content": 0.010540487244725227, "timestamp": "2025-09-30 22:07:04.682893", "step": 854, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-30 22:07:04.714393", "step": 854, "epoch": 2 }, { "type": "loss", "content": 0.0030493498779833317, "timestamp": "2025-09-30 22:07:04.722045", "step": 855, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-30 22:07:04.755199", "step": 855, "epoch": 2 }, { "type": "loss", "content": 0.007540632504969835, "timestamp": "2025-09-30 22:07:04.783748", "step": 856, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:07:04.817129", "step": 856, "epoch": 2 }, { "type": "loss", "content": 0.02822980284690857, "timestamp": "2025-09-30 22:07:04.819244", "step": 857, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:07:04.852753", "step": 857, "epoch": 2 }, { "type": "loss", "content": 0.02981366217136383, "timestamp": "2025-09-30 22:07:04.855144", "step": 858, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 224 ], "batch_size": 8, "flops": 4429610391296 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 64 ], "batch_size": 8, "flops": 1265603017216 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 256 ], "batch_size": 8, "flops": 5062411866112 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 5, 144 ], "batch_size": 8, "flops": 2847606704256 } ], "timestamp": "2025-09-30 22:07:05.549713", "step": 858, "epoch": 2 }, { "type": "pplx", "content": 86256591.69055031, "timestamp": "2025-09-30 22:07:05.551227", "step": 858, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 256 ], "flops": 7593922352128 }, "timestamp": "2025-09-30 22:07:05.580616", "step": 858, "epoch": 2 }, { "type": "loss", "content": 0.0039618550799787045, "timestamp": "2025-09-30 22:07:05.588128", "step": 859, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:07:05.620463", "step": 859, "epoch": 2 }, { "type": "loss", "content": 0.030887721106410027, "timestamp": "2025-09-30 22:07:05.645889", "step": 860, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:07:05.680208", "step": 860, "epoch": 2 }, { "type": "loss", "content": 0.017998460680246353, "timestamp": "2025-09-30 22:07:05.682308", "step": 861, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:07:05.719932", "step": 861, "epoch": 2 }, { "type": "loss", "content": 0.0152524309232831, "timestamp": "2025-09-30 22:07:05.722640", "step": 862, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:07:05.761132", "step": 862, "epoch": 2 }, { "type": "loss", "content": 0.01634639874100685, "timestamp": "2025-09-30 22:07:05.765485", "step": 863, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-30 22:07:05.804411", "step": 863, "epoch": 2 }, { "type": "loss", "content": 0.010691942647099495, "timestamp": "2025-09-30 22:07:05.832323", "step": 864, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:07:05.864370", "step": 864, "epoch": 2 }, { "type": "loss", "content": 0.004576417151838541, "timestamp": "2025-09-30 22:07:05.866430", "step": 865, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-30 22:07:05.897941", "step": 865, "epoch": 2 }, { "type": "loss", "content": 0.02053973264992237, "timestamp": "2025-09-30 22:07:05.905133", "step": 866, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:07:05.943773", "step": 866, "epoch": 2 }, { "type": "loss", "content": 0.011172892525792122, "timestamp": "2025-09-30 22:07:05.948064", "step": 867, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-30 22:07:05.979520", "step": 867, "epoch": 2 }, { "type": "loss", "content": 0.00478165689855814, "timestamp": "2025-09-30 22:07:06.007609", "step": 868, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-30 22:07:06.046473", "step": 868, "epoch": 2 }, { "type": "loss", "content": 0.00597166595980525, "timestamp": "2025-09-30 22:07:06.051835", "step": 869, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:07:06.084572", "step": 869, "epoch": 2 }, { "type": "loss", "content": 0.014478781260550022, "timestamp": "2025-09-30 22:07:06.088892", "step": 870, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:07:06.121316", "step": 870, "epoch": 2 }, { "type": "loss", "content": 0.008051961660385132, "timestamp": "2025-09-30 22:07:06.125304", "step": 871, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-30 22:07:06.166173", "step": 871, "epoch": 2 }, { "type": "loss", "content": 0.0020913986954838037, "timestamp": "2025-09-30 22:07:06.194128", "step": 872, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:07:06.228988", "step": 872, "epoch": 2 }, { "type": "loss", "content": 0.004229079000651836, "timestamp": "2025-09-30 22:07:06.230955", "step": 873, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:07:06.262070", "step": 873, "epoch": 2 }, { "type": "loss", "content": 0.00472915219143033, "timestamp": "2025-09-30 22:07:06.264968", "step": 874, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:07:06.296056", "step": 874, "epoch": 2 }, { "type": "loss", "content": 0.009399129077792168, "timestamp": "2025-09-30 22:07:06.297968", "step": 875, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:07:06.337125", "step": 875, "epoch": 2 }, { "type": "loss", "content": 0.017205609008669853, "timestamp": "2025-09-30 22:07:06.360732", "step": 876, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:07:06.394265", "step": 876, "epoch": 2 }, { "type": "loss", "content": 0.019868528470396996, "timestamp": "2025-09-30 22:07:06.396608", "step": 877, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:07:06.430534", "step": 877, "epoch": 2 }, { "type": "loss", "content": 0.020020518451929092, "timestamp": "2025-09-30 22:07:06.432585", "step": 878, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:07:06.469670", "step": 878, "epoch": 2 }, { "type": "loss", "content": 0.0024727715644985437, "timestamp": "2025-09-30 22:07:06.471774", "step": 879, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-30 22:07:06.504053", "step": 879, "epoch": 2 }, { "type": "loss", "content": 0.002944986103102565, "timestamp": "2025-09-30 22:07:06.531947", "step": 880, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 256 ], "flops": 7593922352128 }, "timestamp": "2025-09-30 22:07:06.568330", "step": 880, "epoch": 2 }, { "type": "loss", "content": 0.004082814324647188, "timestamp": "2025-09-30 22:07:06.573756", "step": 881, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:07:06.605310", "step": 881, "epoch": 2 }, { "type": "loss", "content": 0.004995688796043396, "timestamp": "2025-09-30 22:07:06.607374", "step": 882, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-30 22:07:06.641713", "step": 882, "epoch": 2 }, { "type": "loss", "content": 0.004161830525845289, "timestamp": "2025-09-30 22:07:06.648484", "step": 883, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:07:06.688386", "step": 883, "epoch": 2 }, { "type": "loss", "content": 0.005772753152996302, "timestamp": "2025-09-30 22:07:06.712059", "step": 884, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:07:06.743909", "step": 884, "epoch": 2 }, { "type": "loss", "content": 0.003844219958409667, "timestamp": "2025-09-30 22:07:06.746374", "step": 885, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:07:06.778035", "step": 885, "epoch": 2 }, { "type": "loss", "content": 0.00576426088809967, "timestamp": "2025-09-30 22:07:06.780467", "step": 886, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:07:06.819822", "step": 886, "epoch": 2 }, { "type": "loss", "content": 0.02476349100470543, "timestamp": "2025-09-30 22:07:06.822369", "step": 887, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:07:06.861347", "step": 887, "epoch": 2 }, { "type": "loss", "content": 0.005247528199106455, "timestamp": "2025-09-30 22:07:06.885046", "step": 888, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:07:06.916521", "step": 888, "epoch": 2 }, { "type": "loss", "content": 0.012756898067891598, "timestamp": "2025-09-30 22:07:06.918584", "step": 889, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:07:06.950976", "step": 889, "epoch": 2 }, { "type": "loss", "content": 0.0024588818196207285, "timestamp": "2025-09-30 22:07:06.955126", "step": 890, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:07:06.990935", "step": 890, "epoch": 2 }, { "type": "loss", "content": 0.004403135273605585, "timestamp": "2025-09-30 22:07:06.992901", "step": 891, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:07:07.027449", "step": 891, "epoch": 2 }, { "type": "loss", "content": 0.0157319363206625, "timestamp": "2025-09-30 22:07:07.052689", "step": 892, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:07:07.085998", "step": 892, "epoch": 2 }, { "type": "loss", "content": 0.0017558409599587321, "timestamp": "2025-09-30 22:07:07.088108", "step": 893, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:07:07.124712", "step": 893, "epoch": 2 }, { "type": "loss", "content": 0.004555154126137495, "timestamp": "2025-09-30 22:07:07.129197", "step": 894, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-30 22:07:07.163297", "step": 894, "epoch": 2 }, { "type": "loss", "content": 0.013950363732874393, "timestamp": "2025-09-30 22:07:07.170138", "step": 895, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-30 22:07:07.201824", "step": 895, "epoch": 2 }, { "type": "loss", "content": 0.0023241452872753143, "timestamp": "2025-09-30 22:07:07.230501", "step": 896, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-30 22:07:07.264682", "step": 896, "epoch": 2 }, { "type": "loss", "content": 0.0018783170962706208, "timestamp": "2025-09-30 22:07:07.269445", "step": 897, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 224 ], "batch_size": 8, "flops": 4429610391296 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 64 ], "batch_size": 8, "flops": 1265603017216 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 256 ], "batch_size": 8, "flops": 5062411866112 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 5, 144 ], "batch_size": 8, "flops": 2847606704256 } ], "timestamp": "2025-09-30 22:07:07.945628", "step": 897, "epoch": 2 }, { "type": "pplx", "content": 95768593.83513556, "timestamp": "2025-09-30 22:07:07.947542", "step": 897, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-30 22:07:07.977709", "step": 897, "epoch": 2 }, { "type": "loss", "content": 0.0018268562853336334, "timestamp": "2025-09-30 22:07:07.979939", "step": 898, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 272 ], "flops": 8068526078144 }, "timestamp": "2025-09-30 22:07:08.016661", "step": 898, "epoch": 2 }, { "type": "loss", "content": 0.000786967168096453, "timestamp": "2025-09-30 22:07:08.026834", "step": 899, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:07:08.061996", "step": 899, "epoch": 2 }, { "type": "loss", "content": 0.0017735311994329095, "timestamp": "2025-09-30 22:07:08.085681", "step": 900, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-30 22:07:08.117983", "step": 900, "epoch": 2 }, { "type": "loss", "content": 0.0028034248389303684, "timestamp": "2025-09-30 22:07:08.123092", "step": 901, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:07:08.155456", "step": 901, "epoch": 2 }, { "type": "loss", "content": 0.0053888121619820595, "timestamp": "2025-09-30 22:07:08.159956", "step": 902, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:07:08.191966", "step": 902, "epoch": 2 }, { "type": "loss", "content": 0.0014559318078681827, "timestamp": "2025-09-30 22:07:08.196228", "step": 903, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:07:08.227596", "step": 903, "epoch": 2 }, { "type": "loss", "content": 0.0028973512817174196, "timestamp": "2025-09-30 22:07:08.253063", "step": 904, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:07:08.285797", "step": 904, "epoch": 2 }, { "type": "loss", "content": 0.0038677144329994917, "timestamp": "2025-09-30 22:07:08.287802", "step": 905, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 256 ], "flops": 7593922352128 }, "timestamp": "2025-09-30 22:07:08.323927", "step": 905, "epoch": 2 }, { "type": "loss", "content": 0.0034440092276781797, "timestamp": "2025-09-30 22:07:08.331827", "step": 906, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 304 ], "flops": 9017733530176 }, "timestamp": "2025-09-30 22:07:08.363498", "step": 906, "epoch": 2 }, { "type": "loss", "content": 0.0014499167446047068, "timestamp": "2025-09-30 22:07:08.375723", "step": 907, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:07:08.407391", "step": 907, "epoch": 2 }, { "type": "loss", "content": 0.0017258430598303676, "timestamp": "2025-09-30 22:07:08.432998", "step": 908, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-30 22:07:08.463947", "step": 908, "epoch": 2 }, { "type": "loss", "content": 0.0020710090175271034, "timestamp": "2025-09-30 22:07:08.468648", "step": 909, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-30 22:07:08.499735", "step": 909, "epoch": 2 }, { "type": "loss", "content": 0.0018089372897520661, "timestamp": "2025-09-30 22:07:08.506547", "step": 910, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:07:08.541538", "step": 910, "epoch": 2 }, { "type": "loss", "content": 0.0026987947057932615, "timestamp": "2025-09-30 22:07:08.546322", "step": 911, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:07:08.580949", "step": 911, "epoch": 2 }, { "type": "loss", "content": 0.0031114660669118166, "timestamp": "2025-09-30 22:07:08.606421", "step": 912, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:07:08.637806", "step": 912, "epoch": 2 }, { "type": "loss", "content": 0.008608967997133732, "timestamp": "2025-09-30 22:07:08.640045", "step": 913, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:07:08.673689", "step": 913, "epoch": 2 }, { "type": "loss", "content": 0.00260666711255908, "timestamp": "2025-09-30 22:07:08.676010", "step": 914, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:07:08.708330", "step": 914, "epoch": 2 }, { "type": "loss", "content": 0.0011994466185569763, "timestamp": "2025-09-30 22:07:08.711152", "step": 915, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:07:08.743385", "step": 915, "epoch": 2 }, { "type": "loss", "content": 0.004028329625725746, "timestamp": "2025-09-30 22:07:08.767783", "step": 916, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:07:08.802267", "step": 916, "epoch": 2 }, { "type": "loss", "content": 0.0007758959545753896, "timestamp": "2025-09-30 22:07:08.804337", "step": 917, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-30 22:07:08.836653", "step": 917, "epoch": 2 }, { "type": "loss", "content": 0.0007386510260403156, "timestamp": "2025-09-30 22:07:08.843829", "step": 918, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:07:08.883975", "step": 918, "epoch": 2 }, { "type": "loss", "content": 0.0027365325950086117, "timestamp": "2025-09-30 22:07:08.886371", "step": 919, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:07:08.924596", "step": 919, "epoch": 2 }, { "type": "loss", "content": 0.0016872246051207185, "timestamp": "2025-09-30 22:07:08.948515", "step": 920, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:07:08.980561", "step": 920, "epoch": 2 }, { "type": "loss", "content": 0.0009407888865098357, "timestamp": "2025-09-30 22:07:08.983138", "step": 921, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:07:09.017094", "step": 921, "epoch": 2 }, { "type": "loss", "content": 0.00036402264959178865, "timestamp": "2025-09-30 22:07:09.021560", "step": 922, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-30 22:07:09.055955", "step": 922, "epoch": 2 }, { "type": "loss", "content": 0.0008210391388274729, "timestamp": "2025-09-30 22:07:09.063443", "step": 923, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:07:09.097989", "step": 923, "epoch": 2 }, { "type": "loss", "content": 0.001234310562722385, "timestamp": "2025-09-30 22:07:09.123744", "step": 924, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:07:09.158919", "step": 924, "epoch": 2 }, { "type": "loss", "content": 0.004231620114296675, "timestamp": "2025-09-30 22:07:09.161095", "step": 925, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:07:09.193921", "step": 925, "epoch": 2 }, { "type": "loss", "content": 0.0013935185270383954, "timestamp": "2025-09-30 22:07:09.197666", "step": 926, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:07:09.229611", "step": 926, "epoch": 2 }, { "type": "loss", "content": 0.00570475310087204, "timestamp": "2025-09-30 22:07:09.232139", "step": 927, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-30 22:07:09.263920", "step": 927, "epoch": 2 }, { "type": "loss", "content": 0.00038181617856025696, "timestamp": "2025-09-30 22:07:09.287318", "step": 928, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-30 22:07:09.318675", "step": 928, "epoch": 2 }, { "type": "loss", "content": 0.0002547978365328163, "timestamp": "2025-09-30 22:07:09.323157", "step": 929, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-30 22:07:09.356607", "step": 929, "epoch": 2 }, { "type": "loss", "content": 0.003905776422470808, "timestamp": "2025-09-30 22:07:09.363386", "step": 930, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-30 22:07:09.394811", "step": 930, "epoch": 2 }, { "type": "loss", "content": 0.000579020765144378, "timestamp": "2025-09-30 22:07:09.401676", "step": 931, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:07:09.433557", "step": 931, "epoch": 2 }, { "type": "loss", "content": 0.005851599853485823, "timestamp": "2025-09-30 22:07:09.458583", "step": 932, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-30 22:07:09.489645", "step": 932, "epoch": 2 }, { "type": "loss", "content": 0.0011362012010067701, "timestamp": "2025-09-30 22:07:09.494109", "step": 933, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:07:09.533288", "step": 933, "epoch": 2 }, { "type": "loss", "content": 0.004097456112504005, "timestamp": "2025-09-30 22:07:09.535419", "step": 934, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:07:09.569467", "step": 934, "epoch": 2 }, { "type": "loss", "content": 0.0016320300055667758, "timestamp": "2025-09-30 22:07:09.571675", "step": 935, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:07:09.602463", "step": 935, "epoch": 2 }, { "type": "loss", "content": 0.0024072679225355387, "timestamp": "2025-09-30 22:07:09.626031", "step": 936, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 224 ], "batch_size": 8, "flops": 4429610391296 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 64 ], "batch_size": 8, "flops": 1265603017216 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 256 ], "batch_size": 8, "flops": 5062411866112 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 5, 144 ], "batch_size": 8, "flops": 2847606704256 } ], "timestamp": "2025-09-30 22:07:10.327478", "step": 936, "epoch": 2 }, { "type": "pplx", "content": 112929219.40033711, "timestamp": "2025-09-30 22:07:10.329310", "step": 936, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-30 22:07:10.360088", "step": 936, "epoch": 2 }, { "type": "loss", "content": 0.0004962026723660529, "timestamp": "2025-09-30 22:07:10.362276", "step": 937, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:07:10.406581", "step": 937, "epoch": 2 }, { "type": "loss", "content": 0.00023230533406604081, "timestamp": "2025-09-30 22:07:10.410755", "step": 938, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:07:10.451027", "step": 938, "epoch": 2 }, { "type": "loss", "content": 0.0002659259189385921, "timestamp": "2025-09-30 22:07:10.455539", "step": 939, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:07:10.490463", "step": 939, "epoch": 2 }, { "type": "loss", "content": 0.00033930627978406847, "timestamp": "2025-09-30 22:07:10.514420", "step": 940, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:07:10.556888", "step": 940, "epoch": 2 }, { "type": "loss", "content": 0.0016134243924170732, "timestamp": "2025-09-30 22:07:10.559086", "step": 941, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:07:10.590226", "step": 941, "epoch": 2 }, { "type": "loss", "content": 0.0004938853089697659, "timestamp": "2025-09-30 22:07:10.592117", "step": 942, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:07:10.631199", "step": 942, "epoch": 2 }, { "type": "loss", "content": 0.0002870448224712163, "timestamp": "2025-09-30 22:07:10.633506", "step": 943, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:07:10.670751", "step": 943, "epoch": 2 }, { "type": "loss", "content": 0.003749014111235738, "timestamp": "2025-09-30 22:07:10.694301", "step": 944, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-30 22:07:10.736657", "step": 944, "epoch": 2 }, { "type": "loss", "content": 0.0003916143032256514, "timestamp": "2025-09-30 22:07:10.741782", "step": 945, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:07:10.779978", "step": 945, "epoch": 2 }, { "type": "loss", "content": 0.00022652934421785176, "timestamp": "2025-09-30 22:07:10.784524", "step": 946, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:07:10.827099", "step": 946, "epoch": 2 }, { "type": "loss", "content": 0.0004099583311472088, "timestamp": "2025-09-30 22:07:10.832155", "step": 947, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:07:10.871557", "step": 947, "epoch": 2 }, { "type": "loss", "content": 0.000650174799375236, "timestamp": "2025-09-30 22:07:10.896952", "step": 948, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:07:10.942709", "step": 948, "epoch": 2 }, { "type": "loss", "content": 0.002298450330272317, "timestamp": "2025-09-30 22:07:10.944667", "step": 949, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:07:10.978710", "step": 949, "epoch": 2 }, { "type": "loss", "content": 0.00034443842014297843, "timestamp": "2025-09-30 22:07:10.980518", "step": 950, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:07:11.013027", "step": 950, "epoch": 2 }, { "type": "loss", "content": 0.00355920079164207, "timestamp": "2025-09-30 22:07:11.015463", "step": 951, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-30 22:07:11.047952", "step": 951, "epoch": 2 }, { "type": "loss", "content": 0.00011734906729543582, "timestamp": "2025-09-30 22:07:11.076482", "step": 952, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:07:11.107678", "step": 952, "epoch": 2 }, { "type": "loss", "content": 0.0001595637295395136, "timestamp": "2025-09-30 22:07:11.110152", "step": 953, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:07:11.147666", "step": 953, "epoch": 2 }, { "type": "loss", "content": 0.0003285036946181208, "timestamp": "2025-09-30 22:07:11.150055", "step": 954, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:07:11.188918", "step": 954, "epoch": 2 }, { "type": "loss", "content": 0.0007600878598168492, "timestamp": "2025-09-30 22:07:11.190903", "step": 955, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-30 22:07:11.224395", "step": 955, "epoch": 2 }, { "type": "loss", "content": 0.00014429469592869282, "timestamp": "2025-09-30 22:07:11.252477", "step": 956, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:07:11.289600", "step": 956, "epoch": 2 }, { "type": "loss", "content": 0.005687530618160963, "timestamp": "2025-09-30 22:07:11.291958", "step": 957, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:07:11.326919", "step": 957, "epoch": 2 }, { "type": "loss", "content": 0.001010962063446641, "timestamp": "2025-09-30 22:07:11.329592", "step": 958, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:07:11.364523", "step": 958, "epoch": 2 }, { "type": "loss", "content": 0.0005010986351408064, "timestamp": "2025-09-30 22:07:11.367667", "step": 959, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:07:11.402225", "step": 959, "epoch": 2 }, { "type": "loss", "content": 0.001585278776474297, "timestamp": "2025-09-30 22:07:11.426244", "step": 960, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-30 22:07:11.460298", "step": 960, "epoch": 2 }, { "type": "loss", "content": 0.00010611403558868915, "timestamp": "2025-09-30 22:07:11.464889", "step": 961, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:07:11.497324", "step": 961, "epoch": 2 }, { "type": "loss", "content": 0.00093518674839288, "timestamp": "2025-09-30 22:07:11.499809", "step": 962, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:07:11.539635", "step": 962, "epoch": 2 }, { "type": "loss", "content": 0.00011386203550500795, "timestamp": "2025-09-30 22:07:11.542572", "step": 963, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-30 22:07:11.576198", "step": 963, "epoch": 2 }, { "type": "loss", "content": 0.01964985392987728, "timestamp": "2025-09-30 22:07:11.604288", "step": 964, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-30 22:07:11.637144", "step": 964, "epoch": 2 }, { "type": "loss", "content": 0.002611854812130332, "timestamp": "2025-09-30 22:07:11.638996", "step": 965, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:07:11.671779", "step": 965, "epoch": 2 }, { "type": "loss", "content": 0.00027620504260994494, "timestamp": "2025-09-30 22:07:11.676059", "step": 966, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:07:11.710289", "step": 966, "epoch": 2 }, { "type": "loss", "content": 0.0017094091745093465, "timestamp": "2025-09-30 22:07:11.713181", "step": 967, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-30 22:07:11.746173", "step": 967, "epoch": 2 }, { "type": "loss", "content": 9.690089063951746e-05, "timestamp": "2025-09-30 22:07:11.774308", "step": 968, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:07:11.815616", "step": 968, "epoch": 2 }, { "type": "loss", "content": 0.0009474550024606287, "timestamp": "2025-09-30 22:07:11.818783", "step": 969, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:07:11.852885", "step": 969, "epoch": 2 }, { "type": "loss", "content": 0.00027702722582034767, "timestamp": "2025-09-30 22:07:11.855700", "step": 970, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:07:11.888806", "step": 970, "epoch": 2 }, { "type": "loss", "content": 9.48179658735171e-05, "timestamp": "2025-09-30 22:07:11.893370", "step": 971, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:07:11.927029", "step": 971, "epoch": 2 }, { "type": "loss", "content": 0.01434251107275486, "timestamp": "2025-09-30 22:07:11.951056", "step": 972, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-30 22:07:11.986648", "step": 972, "epoch": 2 }, { "type": "loss", "content": 0.00014298749738372862, "timestamp": "2025-09-30 22:07:11.991438", "step": 973, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:07:12.034567", "step": 973, "epoch": 2 }, { "type": "loss", "content": 8.121971768559888e-05, "timestamp": "2025-09-30 22:07:12.038808", "step": 974, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-30 22:07:12.073593", "step": 974, "epoch": 2 }, { "type": "loss", "content": 0.00010338701395085081, "timestamp": "2025-09-30 22:07:12.080501", "step": 975, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 224 ], "batch_size": 8, "flops": 4429610391296 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 64 ], "batch_size": 8, "flops": 1265603017216 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 256 ], "batch_size": 8, "flops": 5062411866112 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 5, 144 ], "batch_size": 8, "flops": 2847606704256 } ], "timestamp": "2025-09-30 22:07:12.812417", "step": 975, "epoch": 2 }, { "type": "pplx", "content": 121873066.00614506, "timestamp": "2025-09-30 22:07:12.814660", "step": 975, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:07:12.850025", "step": 975, "epoch": 2 }, { "type": "loss", "content": 0.007562555372714996, "timestamp": "2025-09-30 22:07:12.873547", "step": 976, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:07:12.913828", "step": 976, "epoch": 2 }, { "type": "loss", "content": 0.03160439059138298, "timestamp": "2025-09-30 22:07:12.916224", "step": 977, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:07:12.955925", "step": 977, "epoch": 2 }, { "type": "loss", "content": 0.000496004126034677, "timestamp": "2025-09-30 22:07:12.958103", "step": 978, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-30 22:07:12.995712", "step": 978, "epoch": 2 }, { "type": "loss", "content": 0.0037520842161029577, "timestamp": "2025-09-30 22:07:13.003017", "step": 979, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:07:13.036947", "step": 979, "epoch": 2 }, { "type": "loss", "content": 0.0006964383646845818, "timestamp": "2025-09-30 22:07:13.060837", "step": 980, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-30 22:07:13.095302", "step": 980, "epoch": 2 }, { "type": "loss", "content": 0.01626410521566868, "timestamp": "2025-09-30 22:07:13.097441", "step": 981, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:07:13.130837", "step": 981, "epoch": 2 }, { "type": "loss", "content": 0.00018333212938159704, "timestamp": "2025-09-30 22:07:13.133238", "step": 982, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 256 ], "flops": 7593922352128 }, "timestamp": "2025-09-30 22:07:13.188556", "step": 982, "epoch": 2 }, { "type": "loss", "content": 7.79297188273631e-05, "timestamp": "2025-09-30 22:07:13.196330", "step": 983, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:07:13.241086", "step": 983, "epoch": 2 }, { "type": "loss", "content": 0.004288059659302235, "timestamp": "2025-09-30 22:07:13.264670", "step": 984, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:07:13.304500", "step": 984, "epoch": 2 }, { "type": "loss", "content": 0.0007159236702136695, "timestamp": "2025-09-30 22:07:13.306981", "step": 985, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:07:13.339972", "step": 985, "epoch": 2 }, { "type": "loss", "content": 0.00033975281985476613, "timestamp": "2025-09-30 22:07:13.344327", "step": 986, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:07:13.376246", "step": 986, "epoch": 2 }, { "type": "loss", "content": 0.03457098826766014, "timestamp": "2025-09-30 22:07:13.378249", "step": 987, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:07:13.420408", "step": 987, "epoch": 2 }, { "type": "loss", "content": 0.0003113284183200449, "timestamp": "2025-09-30 22:07:13.443936", "step": 988, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:07:13.483624", "step": 988, "epoch": 2 }, { "type": "loss", "content": 0.0004182211123406887, "timestamp": "2025-09-30 22:07:13.485802", "step": 989, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:07:13.526373", "step": 989, "epoch": 2 }, { "type": "loss", "content": 0.00030641566263511777, "timestamp": "2025-09-30 22:07:13.528735", "step": 990, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:07:13.568696", "step": 990, "epoch": 2 }, { "type": "loss", "content": 0.010180122219026089, "timestamp": "2025-09-30 22:07:13.573135", "step": 991, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:07:13.607697", "step": 991, "epoch": 2 }, { "type": "loss", "content": 0.00035741503234021366, "timestamp": "2025-09-30 22:07:13.631468", "step": 992, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:07:13.663681", "step": 992, "epoch": 2 }, { "type": "loss", "content": 0.003032468957826495, "timestamp": "2025-09-30 22:07:13.665618", "step": 993, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:07:13.697780", "step": 993, "epoch": 2 }, { "type": "loss", "content": 0.0021785900462418795, "timestamp": "2025-09-30 22:07:13.702360", "step": 994, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:07:13.736901", "step": 994, "epoch": 2 }, { "type": "loss", "content": 0.014290996827185154, "timestamp": "2025-09-30 22:07:13.738898", "step": 995, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:07:13.776796", "step": 995, "epoch": 2 }, { "type": "loss", "content": 0.002437558025121689, "timestamp": "2025-09-30 22:07:13.800364", "step": 996, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:07:13.834493", "step": 996, "epoch": 2 }, { "type": "loss", "content": 0.057113997638225555, "timestamp": "2025-09-30 22:07:13.836581", "step": 997, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:07:13.886520", "step": 997, "epoch": 2 }, { "type": "loss", "content": 0.010231812484562397, "timestamp": "2025-09-30 22:07:13.891076", "step": 998, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-30 22:07:13.925378", "step": 998, "epoch": 2 }, { "type": "loss", "content": 0.00013605566346086562, "timestamp": "2025-09-30 22:07:13.932229", "step": 999, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:07:13.980495", "step": 999, "epoch": 2 }, { "type": "loss", "content": 0.0021123080514371395, "timestamp": "2025-09-30 22:07:14.005860", "step": 1000, "epoch": 2 }, { "type": "info", "content": "Checkpoint saved at step 1000", "timestamp": "2025-09-30 22:07:19.036825", "step": 1000, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-30 22:07:19.072988", "step": 1000, "epoch": 2 }, { "type": "loss", "content": 0.0002514692605473101, "timestamp": "2025-09-30 22:07:19.077266", "step": 1001, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:07:19.112144", "step": 1001, "epoch": 2 }, { "type": "loss", "content": 0.0008739094482734799, "timestamp": "2025-09-30 22:07:19.116564", "step": 1002, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:07:19.150748", "step": 1002, "epoch": 2 }, { "type": "loss", "content": 0.00011316355085000396, "timestamp": "2025-09-30 22:07:19.152969", "step": 1003, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:07:19.188394", "step": 1003, "epoch": 2 }, { "type": "loss", "content": 0.0005285521619953215, "timestamp": "2025-09-30 22:07:19.212056", "step": 1004, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-30 22:07:19.247276", "step": 1004, "epoch": 2 }, { "type": "loss", "content": 0.00322272558696568, "timestamp": "2025-09-30 22:07:19.251941", "step": 1005, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-30 22:07:19.289381", "step": 1005, "epoch": 2 }, { "type": "loss", "content": 0.01631017215549946, "timestamp": "2025-09-30 22:07:19.296423", "step": 1006, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-30 22:07:19.333402", "step": 1006, "epoch": 2 }, { "type": "loss", "content": 0.000126895189168863, "timestamp": "2025-09-30 22:07:19.340347", "step": 1007, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:07:19.375598", "step": 1007, "epoch": 2 }, { "type": "loss", "content": 0.0014090503100305796, "timestamp": "2025-09-30 22:07:19.399080", "step": 1008, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-30 22:07:19.434942", "step": 1008, "epoch": 2 }, { "type": "loss", "content": 0.006856707390397787, "timestamp": "2025-09-30 22:07:19.440215", "step": 1009, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:07:19.480328", "step": 1009, "epoch": 2 }, { "type": "loss", "content": 0.008845113217830658, "timestamp": "2025-09-30 22:07:19.483602", "step": 1010, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:07:19.519136", "step": 1010, "epoch": 2 }, { "type": "loss", "content": 0.012483632192015648, "timestamp": "2025-09-30 22:07:19.521797", "step": 1011, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:07:19.558005", "step": 1011, "epoch": 2 }, { "type": "loss", "content": 0.0018354732310399413, "timestamp": "2025-09-30 22:07:19.582291", "step": 1012, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-30 22:07:19.621414", "step": 1012, "epoch": 2 }, { "type": "loss", "content": 0.0015046221669763327, "timestamp": "2025-09-30 22:07:19.626137", "step": 1013, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:07:19.659456", "step": 1013, "epoch": 2 }, { "type": "loss", "content": 0.0003278079384472221, "timestamp": "2025-09-30 22:07:19.662253", "step": 1014, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 224 ], "batch_size": 8, "flops": 4429610391296 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 64 ], "batch_size": 8, "flops": 1265603017216 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 256 ], "batch_size": 8, "flops": 5062411866112 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 5, 144 ], "batch_size": 8, "flops": 2847606704256 } ], "timestamp": "2025-09-30 22:07:20.510339", "step": 1014, "epoch": 2 }, { "type": "pplx", "content": 121440267.68757698, "timestamp": "2025-09-30 22:07:20.512317", "step": 1014, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:07:20.552632", "step": 1014, "epoch": 2 }, { "type": "loss", "content": 0.02951066754758358, "timestamp": "2025-09-30 22:07:20.554813", "step": 1015, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-30 22:07:20.600361", "step": 1015, "epoch": 2 }, { "type": "loss", "content": 0.0009736703941598535, "timestamp": "2025-09-30 22:07:20.628644", "step": 1016, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:07:20.664907", "step": 1016, "epoch": 2 }, { "type": "loss", "content": 0.0012555711437016726, "timestamp": "2025-09-30 22:07:20.666944", "step": 1017, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:07:20.700237", "step": 1017, "epoch": 2 }, { "type": "loss", "content": 0.0004989461740478873, "timestamp": "2025-09-30 22:07:20.704567", "step": 1018, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 272 ], "flops": 8068526078144 }, "timestamp": "2025-09-30 22:07:20.738301", "step": 1018, "epoch": 2 }, { "type": "loss", "content": 0.0001337010762654245, "timestamp": "2025-09-30 22:07:20.748590", "step": 1019, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:07:20.783354", "step": 1019, "epoch": 2 }, { "type": "loss", "content": 0.0010935006430372596, "timestamp": "2025-09-30 22:07:20.806760", "step": 1020, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:07:20.851261", "step": 1020, "epoch": 2 }, { "type": "loss", "content": 0.03073728457093239, "timestamp": "2025-09-30 22:07:20.853710", "step": 1021, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-30 22:07:20.891776", "step": 1021, "epoch": 2 }, { "type": "loss", "content": 0.0001944132527569309, "timestamp": "2025-09-30 22:07:20.899496", "step": 1022, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:07:20.933172", "step": 1022, "epoch": 2 }, { "type": "loss", "content": 0.0009663135861046612, "timestamp": "2025-09-30 22:07:20.938619", "step": 1023, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:07:20.971818", "step": 1023, "epoch": 2 }, { "type": "loss", "content": 0.024121267721056938, "timestamp": "2025-09-30 22:07:20.995407", "step": 1024, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-30 22:07:21.045645", "step": 1024, "epoch": 2 }, { "type": "loss", "content": 0.002432918641716242, "timestamp": "2025-09-30 22:07:21.051039", "step": 1025, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-30 22:07:21.100559", "step": 1025, "epoch": 2 }, { "type": "loss", "content": 0.00709154549986124, "timestamp": "2025-09-30 22:07:21.108172", "step": 1026, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-30 22:07:21.142739", "step": 1026, "epoch": 2 }, { "type": "loss", "content": 0.00028265800210647285, "timestamp": "2025-09-30 22:07:21.144826", "step": 1027, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 272 ], "flops": 8068526078144 }, "timestamp": "2025-09-30 22:07:21.179158", "step": 1027, "epoch": 2 }, { "type": "loss", "content": 0.004591092932969332, "timestamp": "2025-09-30 22:07:21.210372", "step": 1028, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:07:21.249460", "step": 1028, "epoch": 2 }, { "type": "loss", "content": 0.0005322260549291968, "timestamp": "2025-09-30 22:07:21.251782", "step": 1029, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-30 22:07:21.290982", "step": 1029, "epoch": 2 }, { "type": "loss", "content": 0.012618852779269218, "timestamp": "2025-09-30 22:07:21.298793", "step": 1030, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:07:21.333382", "step": 1030, "epoch": 2 }, { "type": "loss", "content": 0.0010076664621010423, "timestamp": "2025-09-30 22:07:21.335575", "step": 1031, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:07:21.373691", "step": 1031, "epoch": 2 }, { "type": "loss", "content": 0.00020813469018321484, "timestamp": "2025-09-30 22:07:21.397263", "step": 1032, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:07:21.430997", "step": 1032, "epoch": 2 }, { "type": "loss", "content": 0.0012870494974777102, "timestamp": "2025-09-30 22:07:21.433021", "step": 1033, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:07:21.468487", "step": 1033, "epoch": 2 }, { "type": "loss", "content": 0.0004765233024954796, "timestamp": "2025-09-30 22:07:21.474005", "step": 1034, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:07:21.511431", "step": 1034, "epoch": 2 }, { "type": "loss", "content": 0.00030462947324849665, "timestamp": "2025-09-30 22:07:21.513565", "step": 1035, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:07:21.548095", "step": 1035, "epoch": 2 }, { "type": "loss", "content": 0.001975154737010598, "timestamp": "2025-09-30 22:07:21.571628", "step": 1036, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:07:21.606500", "step": 1036, "epoch": 2 }, { "type": "loss", "content": 0.00012028579658363014, "timestamp": "2025-09-30 22:07:21.608830", "step": 1037, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-30 22:07:21.665252", "step": 1037, "epoch": 2 }, { "type": "loss", "content": 0.0015023910673335195, "timestamp": "2025-09-30 22:07:21.672172", "step": 1038, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:07:21.719265", "step": 1038, "epoch": 2 }, { "type": "loss", "content": 0.001884570112451911, "timestamp": "2025-09-30 22:07:21.721969", "step": 1039, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-30 22:07:21.753944", "step": 1039, "epoch": 2 }, { "type": "loss", "content": 0.01647031679749489, "timestamp": "2025-09-30 22:07:21.782545", "step": 1040, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:07:21.821889", "step": 1040, "epoch": 2 }, { "type": "loss", "content": 0.004868580959737301, "timestamp": "2025-09-30 22:07:21.823987", "step": 1041, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 336 ], "flops": 9966940982208 }, "timestamp": "2025-09-30 22:07:21.866728", "step": 1041, "epoch": 2 }, { "type": "loss", "content": 0.00018905648903455585, "timestamp": "2025-09-30 22:07:21.880188", "step": 1042, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:07:21.920644", "step": 1042, "epoch": 2 }, { "type": "loss", "content": 0.005073660518974066, "timestamp": "2025-09-30 22:07:21.923095", "step": 1043, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-30 22:07:21.955870", "step": 1043, "epoch": 2 }, { "type": "loss", "content": 0.0005219231243245304, "timestamp": "2025-09-30 22:07:21.979452", "step": 1044, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:07:22.014807", "step": 1044, "epoch": 2 }, { "type": "loss", "content": 0.0013824685011059046, "timestamp": "2025-09-30 22:07:22.017015", "step": 1045, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-30 22:07:22.051195", "step": 1045, "epoch": 2 }, { "type": "loss", "content": 0.008890213444828987, "timestamp": "2025-09-30 22:07:22.058860", "step": 1046, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:07:22.105804", "step": 1046, "epoch": 2 }, { "type": "loss", "content": 0.00017873049364425242, "timestamp": "2025-09-30 22:07:22.107894", "step": 1047, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:07:22.147997", "step": 1047, "epoch": 2 }, { "type": "loss", "content": 0.00027590824174694717, "timestamp": "2025-09-30 22:07:22.172286", "step": 1048, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:07:22.214891", "step": 1048, "epoch": 2 }, { "type": "loss", "content": 0.0001358627196168527, "timestamp": "2025-09-30 22:07:22.216878", "step": 1049, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:07:22.250266", "step": 1049, "epoch": 2 }, { "type": "loss", "content": 0.0011405263794586062, "timestamp": "2025-09-30 22:07:22.254896", "step": 1050, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:07:22.287435", "step": 1050, "epoch": 2 }, { "type": "loss", "content": 0.0006561552872881293, "timestamp": "2025-09-30 22:07:22.289500", "step": 1051, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:07:22.322065", "step": 1051, "epoch": 2 }, { "type": "loss", "content": 0.003038618015125394, "timestamp": "2025-09-30 22:07:22.345655", "step": 1052, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:07:22.387636", "step": 1052, "epoch": 2 }, { "type": "loss", "content": 0.004722389858216047, "timestamp": "2025-09-30 22:07:22.389633", "step": 1053, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 224 ], "batch_size": 8, "flops": 4429610391296 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 64 ], "batch_size": 8, "flops": 1265603017216 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 256 ], "batch_size": 8, "flops": 5062411866112 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 5, 144 ], "batch_size": 8, "flops": 2847606704256 } ], "timestamp": "2025-09-30 22:07:23.247533", "step": 1053, "epoch": 2 }, { "type": "pplx", "content": 117074198.67895347, "timestamp": "2025-09-30 22:07:23.249615", "step": 1053, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-30 22:07:23.284013", "step": 1053, "epoch": 2 }, { "type": "loss", "content": 0.0001479993516113609, "timestamp": "2025-09-30 22:07:23.291544", "step": 1054, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:07:23.324865", "step": 1054, "epoch": 2 }, { "type": "loss", "content": 0.005029443185776472, "timestamp": "2025-09-30 22:07:23.326902", "step": 1055, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-30 22:07:23.367102", "step": 1055, "epoch": 2 }, { "type": "loss", "content": 0.0011753159342333674, "timestamp": "2025-09-30 22:07:23.394932", "step": 1056, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:07:23.426882", "step": 1056, "epoch": 2 }, { "type": "loss", "content": 0.006908381823450327, "timestamp": "2025-09-30 22:07:23.429042", "step": 1057, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:07:23.462639", "step": 1057, "epoch": 2 }, { "type": "loss", "content": 0.00392345292493701, "timestamp": "2025-09-30 22:07:23.465057", "step": 1058, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 256 ], "flops": 7593922352128 }, "timestamp": "2025-09-30 22:07:23.497575", "step": 1058, "epoch": 2 }, { "type": "loss", "content": 0.00748575059697032, "timestamp": "2025-09-30 22:07:23.505539", "step": 1059, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:07:23.545877", "step": 1059, "epoch": 2 }, { "type": "loss", "content": 0.0001961512753041461, "timestamp": "2025-09-30 22:07:23.569687", "step": 1060, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-30 22:07:23.610728", "step": 1060, "epoch": 2 }, { "type": "loss", "content": 0.0005363413947634399, "timestamp": "2025-09-30 22:07:23.616178", "step": 1061, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:07:23.649617", "step": 1061, "epoch": 2 }, { "type": "loss", "content": 0.00021425398881547153, "timestamp": "2025-09-30 22:07:23.651795", "step": 1062, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:07:23.685918", "step": 1062, "epoch": 2 }, { "type": "loss", "content": 0.016600722447037697, "timestamp": "2025-09-30 22:07:23.688807", "step": 1063, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-30 22:07:23.724286", "step": 1063, "epoch": 2 }, { "type": "loss", "content": 0.0002527764590922743, "timestamp": "2025-09-30 22:07:23.752278", "step": 1064, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:07:23.788709", "step": 1064, "epoch": 2 }, { "type": "loss", "content": 0.019825953990221024, "timestamp": "2025-09-30 22:07:23.790782", "step": 1065, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-30 22:07:23.825375", "step": 1065, "epoch": 2 }, { "type": "loss", "content": 0.00023776039597578347, "timestamp": "2025-09-30 22:07:23.833317", "step": 1066, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:07:23.867092", "step": 1066, "epoch": 2 }, { "type": "loss", "content": 7.995210035005584e-05, "timestamp": "2025-09-30 22:07:23.869672", "step": 1067, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:07:23.906103", "step": 1067, "epoch": 2 }, { "type": "loss", "content": 5.865958155482076e-05, "timestamp": "2025-09-30 22:07:23.929777", "step": 1068, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:07:23.965857", "step": 1068, "epoch": 2 }, { "type": "loss", "content": 0.00011427756544435397, "timestamp": "2025-09-30 22:07:23.967844", "step": 1069, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:07:24.008524", "step": 1069, "epoch": 2 }, { "type": "loss", "content": 0.00019449848332442343, "timestamp": "2025-09-30 22:07:24.011372", "step": 1070, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-30 22:07:24.047641", "step": 1070, "epoch": 2 }, { "type": "loss", "content": 8.977264951681718e-05, "timestamp": "2025-09-30 22:07:24.049571", "step": 1071, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:07:24.083353", "step": 1071, "epoch": 2 }, { "type": "loss", "content": 0.006148288957774639, "timestamp": "2025-09-30 22:07:24.107018", "step": 1072, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:07:24.141017", "step": 1072, "epoch": 2 }, { "type": "loss", "content": 0.0018119210144504905, "timestamp": "2025-09-30 22:07:24.143432", "step": 1073, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:07:24.178422", "step": 1073, "epoch": 2 }, { "type": "loss", "content": 0.00039394685882143676, "timestamp": "2025-09-30 22:07:24.181312", "step": 1074, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:07:24.217732", "step": 1074, "epoch": 2 }, { "type": "loss", "content": 0.030289221554994583, "timestamp": "2025-09-30 22:07:24.220222", "step": 1075, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:07:24.254775", "step": 1075, "epoch": 2 }, { "type": "loss", "content": 0.003143486799672246, "timestamp": "2025-09-30 22:07:24.280327", "step": 1076, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:07:24.315761", "step": 1076, "epoch": 2 }, { "type": "loss", "content": 0.006031307391822338, "timestamp": "2025-09-30 22:07:24.317867", "step": 1077, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:07:24.351722", "step": 1077, "epoch": 2 }, { "type": "loss", "content": 0.0009177754982374609, "timestamp": "2025-09-30 22:07:24.355987", "step": 1078, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-30 22:07:24.391929", "step": 1078, "epoch": 2 }, { "type": "loss", "content": 0.0003655412292573601, "timestamp": "2025-09-30 22:07:24.399251", "step": 1079, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 272 ], "flops": 8068526078144 }, "timestamp": "2025-09-30 22:07:24.442941", "step": 1079, "epoch": 2 }, { "type": "loss", "content": 7.012840069364756e-05, "timestamp": "2025-09-30 22:07:24.474362", "step": 1080, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-30 22:07:24.517405", "step": 1080, "epoch": 2 }, { "type": "loss", "content": 0.001427992945536971, "timestamp": "2025-09-30 22:07:24.522347", "step": 1081, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:07:24.571105", "step": 1081, "epoch": 2 }, { "type": "loss", "content": 0.0016603464027866721, "timestamp": "2025-09-30 22:07:24.573000", "step": 1082, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:07:24.620007", "step": 1082, "epoch": 2 }, { "type": "loss", "content": 0.00029763669590465724, "timestamp": "2025-09-30 22:07:24.622427", "step": 1083, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:07:24.663728", "step": 1083, "epoch": 2 }, { "type": "loss", "content": 0.017944645136594772, "timestamp": "2025-09-30 22:07:24.687249", "step": 1084, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:07:24.726871", "step": 1084, "epoch": 2 }, { "type": "loss", "content": 0.024879327043890953, "timestamp": "2025-09-30 22:07:24.728726", "step": 1085, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:07:24.775816", "step": 1085, "epoch": 2 }, { "type": "loss", "content": 6.769696483388543e-05, "timestamp": "2025-09-30 22:07:24.777895", "step": 1086, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:07:24.817685", "step": 1086, "epoch": 2 }, { "type": "loss", "content": 0.000262332905549556, "timestamp": "2025-09-30 22:07:24.820018", "step": 1087, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:07:24.855976", "step": 1087, "epoch": 2 }, { "type": "loss", "content": 0.0017855723854154348, "timestamp": "2025-09-30 22:07:24.881476", "step": 1088, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:07:24.923211", "step": 1088, "epoch": 2 }, { "type": "loss", "content": 0.001035021268762648, "timestamp": "2025-09-30 22:07:24.925463", "step": 1089, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:07:24.967707", "step": 1089, "epoch": 2 }, { "type": "loss", "content": 0.00034603956737555563, "timestamp": "2025-09-30 22:07:24.970499", "step": 1090, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:07:25.017243", "step": 1090, "epoch": 2 }, { "type": "loss", "content": 0.0006762367556802928, "timestamp": "2025-09-30 22:07:25.019681", "step": 1091, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:07:25.054496", "step": 1091, "epoch": 2 }, { "type": "loss", "content": 0.00011217459541512653, "timestamp": "2025-09-30 22:07:25.078273", "step": 1092, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 224 ], "batch_size": 8, "flops": 4429610391296 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 64 ], "batch_size": 8, "flops": 1265603017216 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 256 ], "batch_size": 8, "flops": 5062411866112 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 5, 144 ], "batch_size": 8, "flops": 2847606704256 } ], "timestamp": "2025-09-30 22:07:25.942337", "step": 1092, "epoch": 2 }, { "type": "pplx", "content": 114812072.89414735, "timestamp": "2025-09-30 22:07:25.944231", "step": 1092, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:07:25.982267", "step": 1092, "epoch": 2 }, { "type": "loss", "content": 0.00041928747668862343, "timestamp": "2025-09-30 22:07:25.984388", "step": 1093, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:07:26.026509", "step": 1093, "epoch": 2 }, { "type": "loss", "content": 0.00014250872482080013, "timestamp": "2025-09-30 22:07:26.028673", "step": 1094, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:07:26.062588", "step": 1094, "epoch": 2 }, { "type": "loss", "content": 0.020229920744895935, "timestamp": "2025-09-30 22:07:26.067197", "step": 1095, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-30 22:07:26.110414", "step": 1095, "epoch": 2 }, { "type": "loss", "content": 0.0002887416922021657, "timestamp": "2025-09-30 22:07:26.138645", "step": 1096, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:07:26.171453", "step": 1096, "epoch": 2 }, { "type": "loss", "content": 0.00011543335858732462, "timestamp": "2025-09-30 22:07:26.173391", "step": 1097, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:07:26.211958", "step": 1097, "epoch": 2 }, { "type": "loss", "content": 0.0009078416624106467, "timestamp": "2025-09-30 22:07:26.216542", "step": 1098, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-30 22:07:26.252557", "step": 1098, "epoch": 2 }, { "type": "loss", "content": 0.0011346840765327215, "timestamp": "2025-09-30 22:07:26.259679", "step": 1099, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:07:26.303161", "step": 1099, "epoch": 2 }, { "type": "loss", "content": 0.02301344834268093, "timestamp": "2025-09-30 22:07:26.326868", "step": 1100, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:07:26.373972", "step": 1100, "epoch": 2 }, { "type": "loss", "content": 0.0005447498406283557, "timestamp": "2025-09-30 22:07:26.376021", "step": 1101, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:07:26.423088", "step": 1101, "epoch": 2 }, { "type": "loss", "content": 0.002310013398528099, "timestamp": "2025-09-30 22:07:26.427404", "step": 1102, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:07:26.466821", "step": 1102, "epoch": 2 }, { "type": "loss", "content": 0.0007707496988587081, "timestamp": "2025-09-30 22:07:26.468888", "step": 1103, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:07:26.509087", "step": 1103, "epoch": 2 }, { "type": "loss", "content": 0.003529587760567665, "timestamp": "2025-09-30 22:07:26.533027", "step": 1104, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:07:26.566816", "step": 1104, "epoch": 2 }, { "type": "loss", "content": 0.0007036178722046316, "timestamp": "2025-09-30 22:07:26.569427", "step": 1105, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:07:26.604263", "step": 1105, "epoch": 2 }, { "type": "loss", "content": 0.002396752592176199, "timestamp": "2025-09-30 22:07:26.608622", "step": 1106, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:07:26.641009", "step": 1106, "epoch": 2 }, { "type": "loss", "content": 0.0011795532191172242, "timestamp": "2025-09-30 22:07:26.643490", "step": 1107, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 272 ], "flops": 8068526078144 }, "timestamp": "2025-09-30 22:07:26.687701", "step": 1107, "epoch": 2 }, { "type": "loss", "content": 0.005366952158510685, "timestamp": "2025-09-30 22:07:26.719029", "step": 1108, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:07:26.763493", "step": 1108, "epoch": 2 }, { "type": "loss", "content": 0.007472001016139984, "timestamp": "2025-09-30 22:07:26.765459", "step": 1109, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:07:26.816703", "step": 1109, "epoch": 2 }, { "type": "loss", "content": 0.005245849024504423, "timestamp": "2025-09-30 22:07:26.819660", "step": 1110, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:07:26.852233", "step": 1110, "epoch": 2 }, { "type": "loss", "content": 0.001431244076229632, "timestamp": "2025-09-30 22:07:26.854268", "step": 1111, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-30 22:07:26.897266", "step": 1111, "epoch": 2 }, { "type": "loss", "content": 0.00028347785701043904, "timestamp": "2025-09-30 22:07:26.920936", "step": 1112, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:07:26.964343", "step": 1112, "epoch": 2 }, { "type": "loss", "content": 0.0003852161462418735, "timestamp": "2025-09-30 22:07:26.966426", "step": 1113, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:07:26.999290", "step": 1113, "epoch": 2 }, { "type": "loss", "content": 0.00235289940610528, "timestamp": "2025-09-30 22:07:27.001433", "step": 1114, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:07:27.034182", "step": 1114, "epoch": 2 }, { "type": "loss", "content": 0.000366671709343791, "timestamp": "2025-09-30 22:07:27.036369", "step": 1115, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 256 ], "flops": 7593922352128 }, "timestamp": "2025-09-30 22:07:27.081274", "step": 1115, "epoch": 2 }, { "type": "loss", "content": 0.00013203351409174502, "timestamp": "2025-09-30 22:07:27.110286", "step": 1116, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:07:27.165920", "step": 1116, "epoch": 2 }, { "type": "loss", "content": 0.0011101525742560625, "timestamp": "2025-09-30 22:07:27.167921", "step": 1117, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:07:27.199904", "step": 1117, "epoch": 2 }, { "type": "loss", "content": 0.005109868943691254, "timestamp": "2025-09-30 22:07:27.202093", "step": 1118, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:07:27.234411", "step": 1118, "epoch": 2 }, { "type": "loss", "content": 0.00020523167040664703, "timestamp": "2025-09-30 22:07:27.239013", "step": 1119, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:07:27.280895", "step": 1119, "epoch": 2 }, { "type": "loss", "content": 0.015378288924694061, "timestamp": "2025-09-30 22:07:27.304752", "step": 1120, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 256 ], "flops": 7593922352128 }, "timestamp": "2025-09-30 22:07:27.340057", "step": 1120, "epoch": 2 }, { "type": "loss", "content": 0.0002846012939698994, "timestamp": "2025-09-30 22:07:27.345741", "step": 1121, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:07:27.383913", "step": 1121, "epoch": 2 }, { "type": "loss", "content": 0.0018362916307523847, "timestamp": "2025-09-30 22:07:27.386049", "step": 1122, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-30 22:07:27.435373", "step": 1122, "epoch": 2 }, { "type": "loss", "content": 0.004865389317274094, "timestamp": "2025-09-30 22:07:27.442458", "step": 1123, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:07:27.486170", "step": 1123, "epoch": 2 }, { "type": "loss", "content": 0.022158129140734673, "timestamp": "2025-09-30 22:07:27.509829", "step": 1124, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:07:27.551375", "step": 1124, "epoch": 2 }, { "type": "loss", "content": 0.00012032425729557872, "timestamp": "2025-09-30 22:07:27.553428", "step": 1125, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 304 ], "flops": 9017733530176 }, "timestamp": "2025-09-30 22:07:27.601023", "step": 1125, "epoch": 2 }, { "type": "loss", "content": 0.0012263988610357046, "timestamp": "2025-09-30 22:07:27.613415", "step": 1126, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:07:27.663420", "step": 1126, "epoch": 2 }, { "type": "loss", "content": 8.227315993281081e-05, "timestamp": "2025-09-30 22:07:27.667792", "step": 1127, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:07:27.710674", "step": 1127, "epoch": 2 }, { "type": "loss", "content": 0.004805626813322306, "timestamp": "2025-09-30 22:07:27.736004", "step": 1128, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:07:27.783760", "step": 1128, "epoch": 2 }, { "type": "loss", "content": 0.039238281548023224, "timestamp": "2025-09-30 22:07:27.785901", "step": 1129, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-30 22:07:27.835489", "step": 1129, "epoch": 2 }, { "type": "loss", "content": 0.00029851426370441914, "timestamp": "2025-09-30 22:07:27.843225", "step": 1130, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:07:27.879179", "step": 1130, "epoch": 2 }, { "type": "loss", "content": 0.00048508719191886485, "timestamp": "2025-09-30 22:07:27.881252", "step": 1131, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 224 ], "batch_size": 8, "flops": 4429610391296 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 64 ], "batch_size": 8, "flops": 1265603017216 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 256 ], "batch_size": 8, "flops": 5062411866112 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 5, 144 ], "batch_size": 8, "flops": 2847606704256 } ], "timestamp": "2025-09-30 22:07:28.696294", "step": 1131, "epoch": 2 }, { "type": "pplx", "content": 111057330.30651917, "timestamp": "2025-09-30 22:07:28.698984", "step": 1131, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:07:28.728636", "step": 1131, "epoch": 2 }, { "type": "loss", "content": 0.00038018793566152453, "timestamp": "2025-09-30 22:07:28.752198", "step": 1132, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-30 22:07:28.795951", "step": 1132, "epoch": 2 }, { "type": "loss", "content": 0.00018479253049008548, "timestamp": "2025-09-30 22:07:28.800746", "step": 1133, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:07:28.836099", "step": 1133, "epoch": 2 }, { "type": "loss", "content": 0.00030270571005530655, "timestamp": "2025-09-30 22:07:28.838307", "step": 1134, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:07:28.876430", "step": 1134, "epoch": 2 }, { "type": "loss", "content": 0.00044822480413131416, "timestamp": "2025-09-30 22:07:28.881175", "step": 1135, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 256 ], "flops": 7593922352128 }, "timestamp": "2025-09-30 22:07:28.915107", "step": 1135, "epoch": 2 }, { "type": "loss", "content": 0.004301538225263357, "timestamp": "2025-09-30 22:07:28.944253", "step": 1136, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:07:28.997685", "step": 1136, "epoch": 2 }, { "type": "loss", "content": 0.0001317481219302863, "timestamp": "2025-09-30 22:07:29.000768", "step": 1137, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-30 22:07:29.049135", "step": 1137, "epoch": 2 }, { "type": "loss", "content": 0.002347787842154503, "timestamp": "2025-09-30 22:07:29.056453", "step": 1138, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 80 ], "flops": 2373281365952 }, "timestamp": "2025-09-30 22:07:29.098652", "step": 1138, "epoch": 2 }, { "type": "loss", "content": 0.0011587826302275062, "timestamp": "2025-09-30 22:07:29.102108", "step": 1139, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-30 22:07:29.136199", "step": 1139, "epoch": 2 }, { "type": "loss", "content": 0.0026315338909626007, "timestamp": "2025-09-30 22:07:29.161088", "step": 1140, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-30 22:07:29.194963", "step": 1140, "epoch": 2 }, { "type": "loss", "content": 0.0005633877008222044, "timestamp": "2025-09-30 22:07:29.199920", "step": 1141, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:07:29.232078", "step": 1141, "epoch": 2 }, { "type": "loss", "content": 0.006881002802401781, "timestamp": "2025-09-30 22:07:29.236618", "step": 1142, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:07:29.277385", "step": 1142, "epoch": 2 }, { "type": "loss", "content": 0.0006113244453445077, "timestamp": "2025-09-30 22:07:29.280123", "step": 1143, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:07:29.313899", "step": 1143, "epoch": 2 }, { "type": "loss", "content": 0.01047942228615284, "timestamp": "2025-09-30 22:07:29.337920", "step": 1144, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:07:29.376061", "step": 1144, "epoch": 2 }, { "type": "loss", "content": 0.0009232796146534383, "timestamp": "2025-09-30 22:07:29.378580", "step": 1145, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:07:29.413447", "step": 1145, "epoch": 2 }, { "type": "loss", "content": 0.0028549160342663527, "timestamp": "2025-09-30 22:07:29.418173", "step": 1146, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-30 22:07:29.459570", "step": 1146, "epoch": 2 }, { "type": "loss", "content": 0.00026438047643750906, "timestamp": "2025-09-30 22:07:29.466695", "step": 1147, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:07:29.516982", "step": 1147, "epoch": 2 }, { "type": "loss", "content": 0.006288810167461634, "timestamp": "2025-09-30 22:07:29.541140", "step": 1148, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:07:29.578736", "step": 1148, "epoch": 2 }, { "type": "loss", "content": 0.009455591440200806, "timestamp": "2025-09-30 22:07:29.581409", "step": 1149, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-30 22:07:29.618934", "step": 1149, "epoch": 2 }, { "type": "loss", "content": 0.003386803437024355, "timestamp": "2025-09-30 22:07:29.626128", "step": 1150, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-30 22:07:29.664128", "step": 1150, "epoch": 2 }, { "type": "loss", "content": 0.0006055928533896804, "timestamp": "2025-09-30 22:07:29.671155", "step": 1151, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:07:29.712255", "step": 1151, "epoch": 2 }, { "type": "loss", "content": 0.038205623626708984, "timestamp": "2025-09-30 22:07:29.736319", "step": 1152, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:07:29.770374", "step": 1152, "epoch": 2 }, { "type": "loss", "content": 0.003251886460930109, "timestamp": "2025-09-30 22:07:29.773048", "step": 1153, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:07:29.808933", "step": 1153, "epoch": 2 }, { "type": "loss", "content": 0.00016367484931834042, "timestamp": "2025-09-30 22:07:29.811337", "step": 1154, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:07:29.844482", "step": 1154, "epoch": 2 }, { "type": "loss", "content": 7.107849523890764e-05, "timestamp": "2025-09-30 22:07:29.849187", "step": 1155, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:07:29.883924", "step": 1155, "epoch": 2 }, { "type": "loss", "content": 0.002373484428972006, "timestamp": "2025-09-30 22:07:29.908377", "step": 1156, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:07:29.950141", "step": 1156, "epoch": 2 }, { "type": "loss", "content": 0.00015534063277300447, "timestamp": "2025-09-30 22:07:29.952180", "step": 1157, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:07:29.989428", "step": 1157, "epoch": 2 }, { "type": "loss", "content": 0.00014958075189497322, "timestamp": "2025-09-30 22:07:29.992165", "step": 1158, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:07:30.027404", "step": 1158, "epoch": 2 }, { "type": "loss", "content": 0.001462747110053897, "timestamp": "2025-09-30 22:07:30.031747", "step": 1159, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:07:30.065053", "step": 1159, "epoch": 2 }, { "type": "loss", "content": 0.0019475112203508615, "timestamp": "2025-09-30 22:07:30.089231", "step": 1160, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:07:30.124473", "step": 1160, "epoch": 2 }, { "type": "loss", "content": 0.01143405307084322, "timestamp": "2025-09-30 22:07:30.127259", "step": 1161, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-30 22:07:30.162715", "step": 1161, "epoch": 2 }, { "type": "loss", "content": 0.02949167788028717, "timestamp": "2025-09-30 22:07:30.169782", "step": 1162, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:07:30.206509", "step": 1162, "epoch": 2 }, { "type": "loss", "content": 0.0006551267579197884, "timestamp": "2025-09-30 22:07:30.210897", "step": 1163, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-30 22:07:30.245757", "step": 1163, "epoch": 2 }, { "type": "loss", "content": 0.025250211358070374, "timestamp": "2025-09-30 22:07:30.273860", "step": 1164, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:07:30.307190", "step": 1164, "epoch": 2 }, { "type": "loss", "content": 0.0009507841314189136, "timestamp": "2025-09-30 22:07:30.311036", "step": 1165, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:07:30.344818", "step": 1165, "epoch": 2 }, { "type": "loss", "content": 0.01949252560734749, "timestamp": "2025-09-30 22:07:30.349033", "step": 1166, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-30 22:07:30.384127", "step": 1166, "epoch": 2 }, { "type": "loss", "content": 0.0043541546911001205, "timestamp": "2025-09-30 22:07:30.387046", "step": 1167, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:07:30.422103", "step": 1167, "epoch": 2 }, { "type": "loss", "content": 0.00019489186524879187, "timestamp": "2025-09-30 22:07:30.446610", "step": 1168, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:07:30.479821", "step": 1168, "epoch": 2 }, { "type": "loss", "content": 0.00754409609362483, "timestamp": "2025-09-30 22:07:30.482482", "step": 1169, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-30 22:07:30.516951", "step": 1169, "epoch": 2 }, { "type": "loss", "content": 0.00020673531980719417, "timestamp": "2025-09-30 22:07:30.524294", "step": 1170, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 224 ], "batch_size": 8, "flops": 4429610391296 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 64 ], "batch_size": 8, "flops": 1265603017216 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 256 ], "batch_size": 8, "flops": 5062411866112 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 5, 144 ], "batch_size": 8, "flops": 2847606704256 } ], "timestamp": "2025-09-30 22:07:31.226245", "step": 1170, "epoch": 2 }, { "type": "pplx", "content": 107393841.23209043, "timestamp": "2025-09-30 22:07:31.228308", "step": 1170, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:07:31.258468", "step": 1170, "epoch": 2 }, { "type": "loss", "content": 0.004547130316495895, "timestamp": "2025-09-30 22:07:31.260654", "step": 1171, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:07:31.314875", "step": 1171, "epoch": 2 }, { "type": "loss", "content": 8.820913353702053e-05, "timestamp": "2025-09-30 22:07:31.338414", "step": 1172, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:07:31.372133", "step": 1172, "epoch": 2 }, { "type": "loss", "content": 0.0012921736342832446, "timestamp": "2025-09-30 22:07:31.374214", "step": 1173, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:07:31.412384", "step": 1173, "epoch": 2 }, { "type": "loss", "content": 0.00023375029559247196, "timestamp": "2025-09-30 22:07:31.414570", "step": 1174, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-30 22:07:31.448147", "step": 1174, "epoch": 2 }, { "type": "loss", "content": 0.0003004588943440467, "timestamp": "2025-09-30 22:07:31.455362", "step": 1175, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:07:31.487937", "step": 1175, "epoch": 2 }, { "type": "loss", "content": 0.00036343271494843066, "timestamp": "2025-09-30 22:07:31.511390", "step": 1176, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:07:31.550605", "step": 1176, "epoch": 2 }, { "type": "loss", "content": 0.000501815986353904, "timestamp": "2025-09-30 22:07:31.552786", "step": 1177, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-30 22:07:31.585625", "step": 1177, "epoch": 2 }, { "type": "loss", "content": 0.00023165717720985413, "timestamp": "2025-09-30 22:07:31.587783", "step": 1178, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:07:31.622101", "step": 1178, "epoch": 2 }, { "type": "loss", "content": 0.0015713156899437308, "timestamp": "2025-09-30 22:07:31.626934", "step": 1179, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:07:31.659455", "step": 1179, "epoch": 2 }, { "type": "loss", "content": 0.0007136641070246696, "timestamp": "2025-09-30 22:07:31.684955", "step": 1180, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:07:31.723796", "step": 1180, "epoch": 2 }, { "type": "loss", "content": 0.0010480453493073583, "timestamp": "2025-09-30 22:07:31.726094", "step": 1181, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:07:31.758210", "step": 1181, "epoch": 2 }, { "type": "loss", "content": 0.002461208263412118, "timestamp": "2025-09-30 22:07:31.760498", "step": 1182, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:07:31.799843", "step": 1182, "epoch": 2 }, { "type": "loss", "content": 0.0006307312869466841, "timestamp": "2025-09-30 22:07:31.804128", "step": 1183, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-30 22:07:31.836251", "step": 1183, "epoch": 2 }, { "type": "loss", "content": 0.0055591752752661705, "timestamp": "2025-09-30 22:07:31.864519", "step": 1184, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-30 22:07:31.903849", "step": 1184, "epoch": 2 }, { "type": "loss", "content": 0.0013035887386649847, "timestamp": "2025-09-30 22:07:31.908539", "step": 1185, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:07:31.941569", "step": 1185, "epoch": 2 }, { "type": "loss", "content": 0.0003693876205943525, "timestamp": "2025-09-30 22:07:31.943777", "step": 1186, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:07:31.977278", "step": 1186, "epoch": 2 }, { "type": "loss", "content": 0.0008305688970722258, "timestamp": "2025-09-30 22:07:31.982309", "step": 1187, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-30 22:07:32.015357", "step": 1187, "epoch": 2 }, { "type": "loss", "content": 0.0003829421184491366, "timestamp": "2025-09-30 22:07:32.043487", "step": 1188, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:07:32.077283", "step": 1188, "epoch": 2 }, { "type": "loss", "content": 0.00402789656072855, "timestamp": "2025-09-30 22:07:32.079679", "step": 1189, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:07:32.112944", "step": 1189, "epoch": 2 }, { "type": "loss", "content": 0.00023092723859008402, "timestamp": "2025-09-30 22:07:32.115035", "step": 1190, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-30 22:07:32.148948", "step": 1190, "epoch": 2 }, { "type": "loss", "content": 0.0004790943639818579, "timestamp": "2025-09-30 22:07:32.156001", "step": 1191, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-30 22:07:32.194346", "step": 1191, "epoch": 2 }, { "type": "loss", "content": 0.00016945795505307615, "timestamp": "2025-09-30 22:07:32.217962", "step": 1192, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:07:32.259407", "step": 1192, "epoch": 2 }, { "type": "loss", "content": 0.001122702145949006, "timestamp": "2025-09-30 22:07:32.261909", "step": 1193, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-30 22:07:32.294564", "step": 1193, "epoch": 2 }, { "type": "loss", "content": 0.00017199316062033176, "timestamp": "2025-09-30 22:07:32.301669", "step": 1194, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:07:32.336916", "step": 1194, "epoch": 2 }, { "type": "loss", "content": 0.0003308496088720858, "timestamp": "2025-09-30 22:07:32.341319", "step": 1195, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:07:32.373387", "step": 1195, "epoch": 2 }, { "type": "loss", "content": 0.001034219516441226, "timestamp": "2025-09-30 22:07:32.396897", "step": 1196, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-30 22:07:32.431249", "step": 1196, "epoch": 2 }, { "type": "loss", "content": 0.000114070026029367, "timestamp": "2025-09-30 22:07:32.435927", "step": 1197, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-30 22:07:32.479303", "step": 1197, "epoch": 2 }, { "type": "loss", "content": 0.0019510245183482766, "timestamp": "2025-09-30 22:07:32.487092", "step": 1198, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:07:32.519501", "step": 1198, "epoch": 2 }, { "type": "loss", "content": 0.00010530307190492749, "timestamp": "2025-09-30 22:07:32.521706", "step": 1199, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:07:32.557335", "step": 1199, "epoch": 2 }, { "type": "loss", "content": 0.00038648530608043075, "timestamp": "2025-09-30 22:07:32.580927", "step": 1200, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:07:32.622607", "step": 1200, "epoch": 2 }, { "type": "loss", "content": 0.0008311023120768368, "timestamp": "2025-09-30 22:07:32.624938", "step": 1201, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-30 22:07:32.664466", "step": 1201, "epoch": 2 }, { "type": "loss", "content": 0.005615743342787027, "timestamp": "2025-09-30 22:07:32.671706", "step": 1202, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:07:32.706151", "step": 1202, "epoch": 2 }, { "type": "loss", "content": 0.0002661571779754013, "timestamp": "2025-09-30 22:07:32.710377", "step": 1203, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:07:32.743688", "step": 1203, "epoch": 2 }, { "type": "loss", "content": 0.0001349939702777192, "timestamp": "2025-09-30 22:07:32.769416", "step": 1204, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-30 22:07:32.811244", "step": 1204, "epoch": 2 }, { "type": "loss", "content": 0.00019696114759426564, "timestamp": "2025-09-30 22:07:32.816686", "step": 1205, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:07:32.851179", "step": 1205, "epoch": 2 }, { "type": "loss", "content": 0.0005079305847175419, "timestamp": "2025-09-30 22:07:32.855636", "step": 1206, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:07:32.891640", "step": 1206, "epoch": 2 }, { "type": "loss", "content": 0.0006546974182128906, "timestamp": "2025-09-30 22:07:32.896010", "step": 1207, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:07:32.949034", "step": 1207, "epoch": 2 }, { "type": "loss", "content": 0.0014550471678376198, "timestamp": "2025-09-30 22:07:32.972497", "step": 1208, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-30 22:07:33.011042", "step": 1208, "epoch": 2 }, { "type": "loss", "content": 0.002267097821459174, "timestamp": "2025-09-30 22:07:33.013074", "step": 1209, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 224 ], "batch_size": 8, "flops": 4429610391296 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 64 ], "batch_size": 8, "flops": 1265603017216 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 256 ], "batch_size": 8, "flops": 5062411866112 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 5, 144 ], "batch_size": 8, "flops": 2847606704256 } ], "timestamp": "2025-09-30 22:07:33.696170", "step": 1209, "epoch": 2 }, { "type": "pplx", "content": 108422431.51281904, "timestamp": "2025-09-30 22:07:33.698353", "step": 1209, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:07:33.727709", "step": 1209, "epoch": 2 }, { "type": "loss", "content": 0.0005266775260679424, "timestamp": "2025-09-30 22:07:33.732299", "step": 1210, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:07:33.764619", "step": 1210, "epoch": 2 }, { "type": "loss", "content": 0.014525589533150196, "timestamp": "2025-09-30 22:07:33.766776", "step": 1211, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:07:33.807720", "step": 1211, "epoch": 2 }, { "type": "loss", "content": 0.0015358092496171594, "timestamp": "2025-09-30 22:07:33.833018", "step": 1212, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:07:33.870207", "step": 1212, "epoch": 2 }, { "type": "loss", "content": 0.00038857717299833894, "timestamp": "2025-09-30 22:07:33.872719", "step": 1213, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:07:33.908278", "step": 1213, "epoch": 2 }, { "type": "loss", "content": 0.00033371103927493095, "timestamp": "2025-09-30 22:07:33.912847", "step": 1214, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:07:33.946687", "step": 1214, "epoch": 2 }, { "type": "loss", "content": 7.424094656016678e-05, "timestamp": "2025-09-30 22:07:33.949561", "step": 1215, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:07:33.982696", "step": 1215, "epoch": 2 }, { "type": "loss", "content": 0.0003344232391100377, "timestamp": "2025-09-30 22:07:34.006596", "step": 1216, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-30 22:07:34.041977", "step": 1216, "epoch": 2 }, { "type": "loss", "content": 0.0001631371706025675, "timestamp": "2025-09-30 22:07:34.044266", "step": 1217, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:07:34.084628", "step": 1217, "epoch": 2 }, { "type": "loss", "content": 0.0026797533500939608, "timestamp": "2025-09-30 22:07:34.087412", "step": 1218, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:07:34.120397", "step": 1218, "epoch": 2 }, { "type": "loss", "content": 0.0005254180869087577, "timestamp": "2025-09-30 22:07:34.122157", "step": 1219, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:07:34.155920", "step": 1219, "epoch": 2 }, { "type": "loss", "content": 0.0001768097426975146, "timestamp": "2025-09-30 22:07:34.179731", "step": 1220, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-30 22:07:34.220980", "step": 1220, "epoch": 2 }, { "type": "loss", "content": 0.0001642854476813227, "timestamp": "2025-09-30 22:07:34.223159", "step": 1221, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:07:34.257299", "step": 1221, "epoch": 2 }, { "type": "loss", "content": 0.00038075828342698514, "timestamp": "2025-09-30 22:07:34.259443", "step": 1222, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 256 ], "flops": 7593922352128 }, "timestamp": "2025-09-30 22:07:34.295921", "step": 1222, "epoch": 2 }, { "type": "loss", "content": 0.013797529973089695, "timestamp": "2025-09-30 22:07:34.304067", "step": 1223, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:07:34.336990", "step": 1223, "epoch": 2 }, { "type": "loss", "content": 0.019091341644525528, "timestamp": "2025-09-30 22:07:34.360616", "step": 1224, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-30 22:07:34.394341", "step": 1224, "epoch": 2 }, { "type": "loss", "content": 0.00017502431001048535, "timestamp": "2025-09-30 22:07:34.398956", "step": 1225, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:07:34.431879", "step": 1225, "epoch": 2 }, { "type": "loss", "content": 0.0009142042836174369, "timestamp": "2025-09-30 22:07:34.434803", "step": 1226, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:07:34.467132", "step": 1226, "epoch": 2 }, { "type": "loss", "content": 0.000977121526375413, "timestamp": "2025-09-30 22:07:34.471775", "step": 1227, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:07:34.514271", "step": 1227, "epoch": 2 }, { "type": "loss", "content": 0.0005587582127191126, "timestamp": "2025-09-30 22:07:34.538008", "step": 1228, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:07:34.588925", "step": 1228, "epoch": 2 }, { "type": "loss", "content": 0.001227530068717897, "timestamp": "2025-09-30 22:07:34.591044", "step": 1229, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:07:34.624813", "step": 1229, "epoch": 2 }, { "type": "loss", "content": 0.04056653380393982, "timestamp": "2025-09-30 22:07:34.628727", "step": 1230, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:07:34.668933", "step": 1230, "epoch": 2 }, { "type": "loss", "content": 0.00010975310578942299, "timestamp": "2025-09-30 22:07:34.670920", "step": 1231, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:07:34.704241", "step": 1231, "epoch": 2 }, { "type": "loss", "content": 0.0007277274271473289, "timestamp": "2025-09-30 22:07:34.729772", "step": 1232, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:07:34.762694", "step": 1232, "epoch": 2 }, { "type": "loss", "content": 0.0003397251130081713, "timestamp": "2025-09-30 22:07:34.764836", "step": 1233, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:07:34.798183", "step": 1233, "epoch": 2 }, { "type": "loss", "content": 0.007443286012858152, "timestamp": "2025-09-30 22:07:34.802875", "step": 1234, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-30 22:07:34.835821", "step": 1234, "epoch": 2 }, { "type": "loss", "content": 0.0015747037250548601, "timestamp": "2025-09-30 22:07:34.842772", "step": 1235, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-30 22:07:34.878376", "step": 1235, "epoch": 2 }, { "type": "loss", "content": 5.67116767342668e-05, "timestamp": "2025-09-30 22:07:34.906530", "step": 1236, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-30 22:07:34.941103", "step": 1236, "epoch": 2 }, { "type": "loss", "content": 0.0033072498627007008, "timestamp": "2025-09-30 22:07:34.943249", "step": 1237, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-30 22:07:34.976375", "step": 1237, "epoch": 2 }, { "type": "loss", "content": 0.0008226304198615253, "timestamp": "2025-09-30 22:07:34.983683", "step": 1238, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:07:35.016379", "step": 1238, "epoch": 2 }, { "type": "loss", "content": 0.013950667344033718, "timestamp": "2025-09-30 22:07:35.019223", "step": 1239, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-30 22:07:35.050865", "step": 1239, "epoch": 2 }, { "type": "loss", "content": 0.0024328443687409163, "timestamp": "2025-09-30 22:07:35.079020", "step": 1240, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:07:35.111767", "step": 1240, "epoch": 2 }, { "type": "loss", "content": 0.009844565764069557, "timestamp": "2025-09-30 22:07:35.114111", "step": 1241, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:07:35.148199", "step": 1241, "epoch": 2 }, { "type": "loss", "content": 0.0006344670546241105, "timestamp": "2025-09-30 22:07:35.152928", "step": 1242, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:07:35.187526", "step": 1242, "epoch": 2 }, { "type": "loss", "content": 0.0005711555131711066, "timestamp": "2025-09-30 22:07:35.191857", "step": 1243, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:07:35.227428", "step": 1243, "epoch": 2 }, { "type": "loss", "content": 0.008294356986880302, "timestamp": "2025-09-30 22:07:35.252810", "step": 1244, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-30 22:07:35.285127", "step": 1244, "epoch": 2 }, { "type": "loss", "content": 0.0008478214731439948, "timestamp": "2025-09-30 22:07:35.289820", "step": 1245, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 2, 192 ], "flops": 2847885110400 }, "timestamp": "2025-09-30 22:07:35.331568", "step": 1245, "epoch": 2 }, { "type": "loss", "content": 0.00015602679923176765, "timestamp": "2025-09-30 22:07:35.333611", "step": 1246, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:07:35.383504", "step": 1246, "epoch": 3 }, { "type": "loss", "content": 0.0039008252788335085, "timestamp": "2025-09-30 22:07:35.385600", "step": 1247, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:07:35.419217", "step": 1247, "epoch": 3 }, { "type": "loss", "content": 0.004876903258264065, "timestamp": "2025-09-30 22:07:35.443027", "step": 1248, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 224 ], "batch_size": 8, "flops": 4429610391296 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 64 ], "batch_size": 8, "flops": 1265603017216 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 256 ], "batch_size": 8, "flops": 5062411866112 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 5, 144 ], "batch_size": 8, "flops": 2847606704256 } ], "timestamp": "2025-09-30 22:07:36.135041", "step": 1248, "epoch": 3 }, { "type": "pplx", "content": 109807121.1574246, "timestamp": "2025-09-30 22:07:36.136996", "step": 1248, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:07:36.176025", "step": 1248, "epoch": 3 }, { "type": "loss", "content": 0.000255336839472875, "timestamp": "2025-09-30 22:07:36.178051", "step": 1249, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:07:36.212961", "step": 1249, "epoch": 3 }, { "type": "loss", "content": 0.004327817354351282, "timestamp": "2025-09-30 22:07:36.217256", "step": 1250, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-30 22:07:36.252409", "step": 1250, "epoch": 3 }, { "type": "loss", "content": 0.0003256227064412087, "timestamp": "2025-09-30 22:07:36.260120", "step": 1251, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:07:36.295118", "step": 1251, "epoch": 3 }, { "type": "loss", "content": 0.0035193650983273983, "timestamp": "2025-09-30 22:07:36.320782", "step": 1252, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-30 22:07:36.354257", "step": 1252, "epoch": 3 }, { "type": "loss", "content": 0.0034225154668092728, "timestamp": "2025-09-30 22:07:36.356290", "step": 1253, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:07:36.395143", "step": 1253, "epoch": 3 }, { "type": "loss", "content": 0.028722798451781273, "timestamp": "2025-09-30 22:07:36.400020", "step": 1254, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-30 22:07:36.432660", "step": 1254, "epoch": 3 }, { "type": "loss", "content": 0.00042226066580042243, "timestamp": "2025-09-30 22:07:36.434794", "step": 1255, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:07:36.467524", "step": 1255, "epoch": 3 }, { "type": "loss", "content": 0.0007823723135516047, "timestamp": "2025-09-30 22:07:36.491012", "step": 1256, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-30 22:07:36.527852", "step": 1256, "epoch": 3 }, { "type": "loss", "content": 0.011989377439022064, "timestamp": "2025-09-30 22:07:36.533147", "step": 1257, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:07:36.566229", "step": 1257, "epoch": 3 }, { "type": "loss", "content": 0.00017260621825698763, "timestamp": "2025-09-30 22:07:36.570419", "step": 1258, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:07:36.603032", "step": 1258, "epoch": 3 }, { "type": "loss", "content": 0.00010432758426759392, "timestamp": "2025-09-30 22:07:36.605767", "step": 1259, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:07:36.637699", "step": 1259, "epoch": 3 }, { "type": "loss", "content": 0.008409126661717892, "timestamp": "2025-09-30 22:07:36.661361", "step": 1260, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:07:36.693688", "step": 1260, "epoch": 3 }, { "type": "loss", "content": 0.03930716961622238, "timestamp": "2025-09-30 22:07:36.695933", "step": 1261, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:07:36.728645", "step": 1261, "epoch": 3 }, { "type": "loss", "content": 0.004691932816058397, "timestamp": "2025-09-30 22:07:36.731368", "step": 1262, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:07:36.762632", "step": 1262, "epoch": 3 }, { "type": "loss", "content": 0.0003119763860013336, "timestamp": "2025-09-30 22:07:36.765588", "step": 1263, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-30 22:07:36.800587", "step": 1263, "epoch": 3 }, { "type": "loss", "content": 0.00028951704734936357, "timestamp": "2025-09-30 22:07:36.828500", "step": 1264, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-30 22:07:36.860604", "step": 1264, "epoch": 3 }, { "type": "loss", "content": 0.0010244931327179074, "timestamp": "2025-09-30 22:07:36.862339", "step": 1265, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-30 22:07:36.895817", "step": 1265, "epoch": 3 }, { "type": "loss", "content": 0.0013763883616775274, "timestamp": "2025-09-30 22:07:36.903016", "step": 1266, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-30 22:07:36.937081", "step": 1266, "epoch": 3 }, { "type": "loss", "content": 0.0009086270001716912, "timestamp": "2025-09-30 22:07:36.944023", "step": 1267, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:07:36.975857", "step": 1267, "epoch": 3 }, { "type": "loss", "content": 0.004262962378561497, "timestamp": "2025-09-30 22:07:36.999675", "step": 1268, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:07:37.031985", "step": 1268, "epoch": 3 }, { "type": "loss", "content": 0.02412317879498005, "timestamp": "2025-09-30 22:07:37.033995", "step": 1269, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:07:37.067156", "step": 1269, "epoch": 3 }, { "type": "loss", "content": 0.02252691611647606, "timestamp": "2025-09-30 22:07:37.071793", "step": 1270, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-30 22:07:37.108421", "step": 1270, "epoch": 3 }, { "type": "loss", "content": 0.00012152874114690349, "timestamp": "2025-09-30 22:07:37.115419", "step": 1271, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 256 ], "flops": 7593922352128 }, "timestamp": "2025-09-30 22:07:37.149492", "step": 1271, "epoch": 3 }, { "type": "loss", "content": 0.00133028463460505, "timestamp": "2025-09-30 22:07:37.178293", "step": 1272, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:07:37.211271", "step": 1272, "epoch": 3 }, { "type": "loss", "content": 0.022769641131162643, "timestamp": "2025-09-30 22:07:37.213314", "step": 1273, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:07:37.249613", "step": 1273, "epoch": 3 }, { "type": "loss", "content": 0.012292742729187012, "timestamp": "2025-09-30 22:07:37.253055", "step": 1274, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-30 22:07:37.287530", "step": 1274, "epoch": 3 }, { "type": "loss", "content": 0.005768026225268841, "timestamp": "2025-09-30 22:07:37.294740", "step": 1275, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:07:37.329256", "step": 1275, "epoch": 3 }, { "type": "loss", "content": 0.00015319335216190666, "timestamp": "2025-09-30 22:07:37.354968", "step": 1276, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:07:37.396114", "step": 1276, "epoch": 3 }, { "type": "loss", "content": 9.743180271470919e-05, "timestamp": "2025-09-30 22:07:37.398099", "step": 1277, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:07:37.431604", "step": 1277, "epoch": 3 }, { "type": "loss", "content": 0.006411910522729158, "timestamp": "2025-09-30 22:07:37.433438", "step": 1278, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-30 22:07:37.466660", "step": 1278, "epoch": 3 }, { "type": "loss", "content": 0.010445569641888142, "timestamp": "2025-09-30 22:07:37.473756", "step": 1279, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-30 22:07:37.506255", "step": 1279, "epoch": 3 }, { "type": "loss", "content": 0.004939712584018707, "timestamp": "2025-09-30 22:07:37.534338", "step": 1280, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:07:37.566932", "step": 1280, "epoch": 3 }, { "type": "loss", "content": 0.000179723632754758, "timestamp": "2025-09-30 22:07:37.569247", "step": 1281, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:07:37.601976", "step": 1281, "epoch": 3 }, { "type": "loss", "content": 0.006414394360035658, "timestamp": "2025-09-30 22:07:37.604761", "step": 1282, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:07:37.636239", "step": 1282, "epoch": 3 }, { "type": "loss", "content": 0.01634913496673107, "timestamp": "2025-09-30 22:07:37.638114", "step": 1283, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:07:37.669132", "step": 1283, "epoch": 3 }, { "type": "loss", "content": 0.0030618479941040277, "timestamp": "2025-09-30 22:07:37.692933", "step": 1284, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:07:37.724375", "step": 1284, "epoch": 3 }, { "type": "loss", "content": 0.014753976836800575, "timestamp": "2025-09-30 22:07:37.726062", "step": 1285, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:07:37.757327", "step": 1285, "epoch": 3 }, { "type": "loss", "content": 0.00043953536078333855, "timestamp": "2025-09-30 22:07:37.761731", "step": 1286, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-30 22:07:37.794682", "step": 1286, "epoch": 3 }, { "type": "loss", "content": 0.0008979348349384964, "timestamp": "2025-09-30 22:07:37.801834", "step": 1287, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 224 ], "batch_size": 8, "flops": 4429610391296 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 64 ], "batch_size": 8, "flops": 1265603017216 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 256 ], "batch_size": 8, "flops": 5062411866112 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 5, 144 ], "batch_size": 8, "flops": 2847606704256 } ], "timestamp": "2025-09-30 22:07:38.470861", "step": 1287, "epoch": 3 }, { "type": "pplx", "content": 109989573.47868328, "timestamp": "2025-09-30 22:07:38.473187", "step": 1287, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:07:38.508925", "step": 1287, "epoch": 3 }, { "type": "loss", "content": 0.023780453950166702, "timestamp": "2025-09-30 22:07:38.534259", "step": 1288, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:07:38.569925", "step": 1288, "epoch": 3 }, { "type": "loss", "content": 0.0005712545826099813, "timestamp": "2025-09-30 22:07:38.572178", "step": 1289, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-30 22:07:38.604571", "step": 1289, "epoch": 3 }, { "type": "loss", "content": 0.014848102815449238, "timestamp": "2025-09-30 22:07:38.611884", "step": 1290, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:07:38.661965", "step": 1290, "epoch": 3 }, { "type": "loss", "content": 0.0018235408933833241, "timestamp": "2025-09-30 22:07:38.664575", "step": 1291, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:07:38.697886", "step": 1291, "epoch": 3 }, { "type": "loss", "content": 0.0031597481574863195, "timestamp": "2025-09-30 22:07:38.723546", "step": 1292, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:07:38.764873", "step": 1292, "epoch": 3 }, { "type": "loss", "content": 0.0017853927565738559, "timestamp": "2025-09-30 22:07:38.767578", "step": 1293, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-30 22:07:38.803281", "step": 1293, "epoch": 3 }, { "type": "loss", "content": 0.0030321648810058832, "timestamp": "2025-09-30 22:07:38.806057", "step": 1294, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:07:38.841128", "step": 1294, "epoch": 3 }, { "type": "loss", "content": 0.002925761044025421, "timestamp": "2025-09-30 22:07:38.843318", "step": 1295, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:07:38.878262", "step": 1295, "epoch": 3 }, { "type": "loss", "content": 0.0015968728112056851, "timestamp": "2025-09-30 22:07:38.901759", "step": 1296, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:07:38.943013", "step": 1296, "epoch": 3 }, { "type": "loss", "content": 0.005096987821161747, "timestamp": "2025-09-30 22:07:38.945453", "step": 1297, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:07:38.982097", "step": 1297, "epoch": 3 }, { "type": "loss", "content": 0.0007914546877145767, "timestamp": "2025-09-30 22:07:38.984606", "step": 1298, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:07:39.019090", "step": 1298, "epoch": 3 }, { "type": "loss", "content": 0.0022949494887143373, "timestamp": "2025-09-30 22:07:39.021437", "step": 1299, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:07:39.062138", "step": 1299, "epoch": 3 }, { "type": "loss", "content": 0.000646329834125936, "timestamp": "2025-09-30 22:07:39.086371", "step": 1300, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:07:39.119570", "step": 1300, "epoch": 3 }, { "type": "loss", "content": 0.006533071864396334, "timestamp": "2025-09-30 22:07:39.121936", "step": 1301, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:07:39.155908", "step": 1301, "epoch": 3 }, { "type": "loss", "content": 0.0032184135634452105, "timestamp": "2025-09-30 22:07:39.158607", "step": 1302, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:07:39.192997", "step": 1302, "epoch": 3 }, { "type": "loss", "content": 0.0008461990510113537, "timestamp": "2025-09-30 22:07:39.195669", "step": 1303, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:07:39.229134", "step": 1303, "epoch": 3 }, { "type": "loss", "content": 0.01580544374883175, "timestamp": "2025-09-30 22:07:39.253492", "step": 1304, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:07:39.286631", "step": 1304, "epoch": 3 }, { "type": "loss", "content": 0.015919901430606842, "timestamp": "2025-09-30 22:07:39.288786", "step": 1305, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:07:39.322651", "step": 1305, "epoch": 3 }, { "type": "loss", "content": 0.033842675387859344, "timestamp": "2025-09-30 22:07:39.325255", "step": 1306, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:07:39.357476", "step": 1306, "epoch": 3 }, { "type": "loss", "content": 0.003096401458606124, "timestamp": "2025-09-30 22:07:39.360347", "step": 1307, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:07:39.394566", "step": 1307, "epoch": 3 }, { "type": "loss", "content": 0.0005393567844294012, "timestamp": "2025-09-30 22:07:39.420354", "step": 1308, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:07:39.453395", "step": 1308, "epoch": 3 }, { "type": "loss", "content": 0.0008734531002119184, "timestamp": "2025-09-30 22:07:39.456644", "step": 1309, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:07:39.488834", "step": 1309, "epoch": 3 }, { "type": "loss", "content": 0.002081485465168953, "timestamp": "2025-09-30 22:07:39.493369", "step": 1310, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:07:39.526571", "step": 1310, "epoch": 3 }, { "type": "loss", "content": 0.011021971702575684, "timestamp": "2025-09-30 22:07:39.530759", "step": 1311, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:07:39.565568", "step": 1311, "epoch": 3 }, { "type": "loss", "content": 0.00900907814502716, "timestamp": "2025-09-30 22:07:39.589812", "step": 1312, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:07:39.620721", "step": 1312, "epoch": 3 }, { "type": "loss", "content": 0.011516647413372993, "timestamp": "2025-09-30 22:07:39.624276", "step": 1313, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-30 22:07:39.657077", "step": 1313, "epoch": 3 }, { "type": "loss", "content": 0.000806538388133049, "timestamp": "2025-09-30 22:07:39.658901", "step": 1314, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:07:39.691156", "step": 1314, "epoch": 3 }, { "type": "loss", "content": 0.010637268424034119, "timestamp": "2025-09-30 22:07:39.695673", "step": 1315, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:07:39.727487", "step": 1315, "epoch": 3 }, { "type": "loss", "content": 0.0007261876598931849, "timestamp": "2025-09-30 22:07:39.750945", "step": 1316, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-30 22:07:39.783255", "step": 1316, "epoch": 3 }, { "type": "loss", "content": 0.002348328474909067, "timestamp": "2025-09-30 22:07:39.785684", "step": 1317, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-30 22:07:39.818553", "step": 1317, "epoch": 3 }, { "type": "loss", "content": 0.005363943986594677, "timestamp": "2025-09-30 22:07:39.821258", "step": 1318, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-30 22:07:39.853727", "step": 1318, "epoch": 3 }, { "type": "loss", "content": 0.00031056805164553225, "timestamp": "2025-09-30 22:07:39.856008", "step": 1319, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:07:39.887622", "step": 1319, "epoch": 3 }, { "type": "loss", "content": 0.004423830658197403, "timestamp": "2025-09-30 22:07:39.912275", "step": 1320, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:07:39.945415", "step": 1320, "epoch": 3 }, { "type": "loss", "content": 8.320227061631158e-05, "timestamp": "2025-09-30 22:07:39.948184", "step": 1321, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:07:39.980603", "step": 1321, "epoch": 3 }, { "type": "loss", "content": 0.0002933432988356799, "timestamp": "2025-09-30 22:07:39.985405", "step": 1322, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:07:40.018838", "step": 1322, "epoch": 3 }, { "type": "loss", "content": 0.0042983428575098515, "timestamp": "2025-09-30 22:07:40.022003", "step": 1323, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-30 22:07:40.056300", "step": 1323, "epoch": 3 }, { "type": "loss", "content": 0.014138293452560902, "timestamp": "2025-09-30 22:07:40.084488", "step": 1324, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-30 22:07:40.118024", "step": 1324, "epoch": 3 }, { "type": "loss", "content": 0.03766119107604027, "timestamp": "2025-09-30 22:07:40.123535", "step": 1325, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:07:40.157052", "step": 1325, "epoch": 3 }, { "type": "loss", "content": 0.001065706484951079, "timestamp": "2025-09-30 22:07:40.161626", "step": 1326, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 224 ], "batch_size": 8, "flops": 4429610391296 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 64 ], "batch_size": 8, "flops": 1265603017216 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 256 ], "batch_size": 8, "flops": 5062411866112 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 5, 144 ], "batch_size": 8, "flops": 2847606704256 } ], "timestamp": "2025-09-30 22:07:40.814410", "step": 1326, "epoch": 3 }, { "type": "pplx", "content": 106030059.01171215, "timestamp": "2025-09-30 22:07:40.816322", "step": 1326, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-30 22:07:40.846018", "step": 1326, "epoch": 3 }, { "type": "loss", "content": 0.0014224022161215544, "timestamp": "2025-09-30 22:07:40.853213", "step": 1327, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:07:40.886027", "step": 1327, "epoch": 3 }, { "type": "loss", "content": 0.0001248378393938765, "timestamp": "2025-09-30 22:07:40.911756", "step": 1328, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-30 22:07:40.944563", "step": 1328, "epoch": 3 }, { "type": "loss", "content": 0.00013765999756287783, "timestamp": "2025-09-30 22:07:40.950013", "step": 1329, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:07:40.986025", "step": 1329, "epoch": 3 }, { "type": "loss", "content": 0.003243004670366645, "timestamp": "2025-09-30 22:07:40.990606", "step": 1330, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-30 22:07:41.021649", "step": 1330, "epoch": 3 }, { "type": "loss", "content": 0.0005152979865670204, "timestamp": "2025-09-30 22:07:41.029170", "step": 1331, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:07:41.064293", "step": 1331, "epoch": 3 }, { "type": "loss", "content": 0.0005544818704947829, "timestamp": "2025-09-30 22:07:41.088257", "step": 1332, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-30 22:07:41.121724", "step": 1332, "epoch": 3 }, { "type": "loss", "content": 0.001916523789986968, "timestamp": "2025-09-30 22:07:41.127255", "step": 1333, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:07:41.159608", "step": 1333, "epoch": 3 }, { "type": "loss", "content": 0.0005377818597480655, "timestamp": "2025-09-30 22:07:41.161126", "step": 1334, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-30 22:07:41.193703", "step": 1334, "epoch": 3 }, { "type": "loss", "content": 0.0011306932428851724, "timestamp": "2025-09-30 22:07:41.200986", "step": 1335, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:07:41.232954", "step": 1335, "epoch": 3 }, { "type": "loss", "content": 0.0022087269462645054, "timestamp": "2025-09-30 22:07:41.258171", "step": 1336, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:07:41.291315", "step": 1336, "epoch": 3 }, { "type": "loss", "content": 0.00083199079381302, "timestamp": "2025-09-30 22:07:41.293552", "step": 1337, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:07:41.327551", "step": 1337, "epoch": 3 }, { "type": "loss", "content": 0.0007499647326767445, "timestamp": "2025-09-30 22:07:41.332457", "step": 1338, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:07:41.368792", "step": 1338, "epoch": 3 }, { "type": "loss", "content": 0.02916637435555458, "timestamp": "2025-09-30 22:07:41.373358", "step": 1339, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:07:41.410206", "step": 1339, "epoch": 3 }, { "type": "loss", "content": 0.008801432326436043, "timestamp": "2025-09-30 22:07:41.436123", "step": 1340, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:07:41.467956", "step": 1340, "epoch": 3 }, { "type": "loss", "content": 0.0003800613048952073, "timestamp": "2025-09-30 22:07:41.470444", "step": 1341, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:07:41.502289", "step": 1341, "epoch": 3 }, { "type": "loss", "content": 0.00023526222503278404, "timestamp": "2025-09-30 22:07:41.506904", "step": 1342, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:07:41.537764", "step": 1342, "epoch": 3 }, { "type": "loss", "content": 0.0013486042153090239, "timestamp": "2025-09-30 22:07:41.539519", "step": 1343, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:07:41.571710", "step": 1343, "epoch": 3 }, { "type": "loss", "content": 0.0008594534010626376, "timestamp": "2025-09-30 22:07:41.594882", "step": 1344, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-30 22:07:41.628068", "step": 1344, "epoch": 3 }, { "type": "loss", "content": 0.002135425340384245, "timestamp": "2025-09-30 22:07:41.633603", "step": 1345, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:07:41.669158", "step": 1345, "epoch": 3 }, { "type": "loss", "content": 0.0004654375952668488, "timestamp": "2025-09-30 22:07:41.672158", "step": 1346, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-30 22:07:41.707133", "step": 1346, "epoch": 3 }, { "type": "loss", "content": 0.0007671394268982112, "timestamp": "2025-09-30 22:07:41.714176", "step": 1347, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:07:41.748009", "step": 1347, "epoch": 3 }, { "type": "loss", "content": 0.00014771531277801841, "timestamp": "2025-09-30 22:07:41.773750", "step": 1348, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:07:41.807576", "step": 1348, "epoch": 3 }, { "type": "loss", "content": 0.0006639771163463593, "timestamp": "2025-09-30 22:07:41.809571", "step": 1349, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:07:41.843263", "step": 1349, "epoch": 3 }, { "type": "loss", "content": 0.0008383739041164517, "timestamp": "2025-09-30 22:07:41.847870", "step": 1350, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 256 ], "flops": 7593922352128 }, "timestamp": "2025-09-30 22:07:41.879607", "step": 1350, "epoch": 3 }, { "type": "loss", "content": 0.00017002799722831696, "timestamp": "2025-09-30 22:07:41.887532", "step": 1351, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-30 22:07:41.919217", "step": 1351, "epoch": 3 }, { "type": "loss", "content": 0.0007719314889982343, "timestamp": "2025-09-30 22:07:41.942902", "step": 1352, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:07:41.975422", "step": 1352, "epoch": 3 }, { "type": "loss", "content": 0.0007648206083104014, "timestamp": "2025-09-30 22:07:41.977609", "step": 1353, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:07:42.010502", "step": 1353, "epoch": 3 }, { "type": "loss", "content": 0.0007910222630016506, "timestamp": "2025-09-30 22:07:42.012591", "step": 1354, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:07:42.043880", "step": 1354, "epoch": 3 }, { "type": "loss", "content": 0.0020354478619992733, "timestamp": "2025-09-30 22:07:42.045704", "step": 1355, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:07:42.077166", "step": 1355, "epoch": 3 }, { "type": "loss", "content": 0.004846019204705954, "timestamp": "2025-09-30 22:07:42.102588", "step": 1356, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:07:42.135357", "step": 1356, "epoch": 3 }, { "type": "loss", "content": 0.004001092631369829, "timestamp": "2025-09-30 22:07:42.137610", "step": 1357, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:07:42.176312", "step": 1357, "epoch": 3 }, { "type": "loss", "content": 0.0003547620144672692, "timestamp": "2025-09-30 22:07:42.178435", "step": 1358, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:07:42.211823", "step": 1358, "epoch": 3 }, { "type": "loss", "content": 0.001670529949478805, "timestamp": "2025-09-30 22:07:42.216033", "step": 1359, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-30 22:07:42.248214", "step": 1359, "epoch": 3 }, { "type": "loss", "content": 0.020056426525115967, "timestamp": "2025-09-30 22:07:42.276989", "step": 1360, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:07:42.309095", "step": 1360, "epoch": 3 }, { "type": "loss", "content": 0.0006083712796680629, "timestamp": "2025-09-30 22:07:42.311340", "step": 1361, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:07:42.344216", "step": 1361, "epoch": 3 }, { "type": "loss", "content": 0.0007033172878436744, "timestamp": "2025-09-30 22:07:42.348786", "step": 1362, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-30 22:07:42.381169", "step": 1362, "epoch": 3 }, { "type": "loss", "content": 0.0025410365778952837, "timestamp": "2025-09-30 22:07:42.383295", "step": 1363, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-30 22:07:42.418777", "step": 1363, "epoch": 3 }, { "type": "loss", "content": 0.0006478140712715685, "timestamp": "2025-09-30 22:07:42.446782", "step": 1364, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:07:42.478821", "step": 1364, "epoch": 3 }, { "type": "loss", "content": 0.000585120462346822, "timestamp": "2025-09-30 22:07:42.480984", "step": 1365, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 224 ], "batch_size": 8, "flops": 4429610391296 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 64 ], "batch_size": 8, "flops": 1265603017216 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 256 ], "batch_size": 8, "flops": 5062411866112 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 5, 144 ], "batch_size": 8, "flops": 2847606704256 } ], "timestamp": "2025-09-30 22:07:43.129212", "step": 1365, "epoch": 3 }, { "type": "pplx", "content": 109053889.68612264, "timestamp": "2025-09-30 22:07:43.130865", "step": 1365, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 288 ], "flops": 8543129804160 }, "timestamp": "2025-09-30 22:07:43.161899", "step": 1365, "epoch": 3 }, { "type": "loss", "content": 0.0001839351753005758, "timestamp": "2025-09-30 22:07:43.172005", "step": 1366, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:07:43.206699", "step": 1366, "epoch": 3 }, { "type": "loss", "content": 0.0008755376911722124, "timestamp": "2025-09-30 22:07:43.211246", "step": 1367, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:07:43.245998", "step": 1367, "epoch": 3 }, { "type": "loss", "content": 0.0005775559693574905, "timestamp": "2025-09-30 22:07:43.269389", "step": 1368, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:07:43.302158", "step": 1368, "epoch": 3 }, { "type": "loss", "content": 0.0018166237277910113, "timestamp": "2025-09-30 22:07:43.304484", "step": 1369, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-30 22:07:43.337709", "step": 1369, "epoch": 3 }, { "type": "loss", "content": 0.0011054730275645852, "timestamp": "2025-09-30 22:07:43.344899", "step": 1370, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:07:43.379219", "step": 1370, "epoch": 3 }, { "type": "loss", "content": 0.002334549557417631, "timestamp": "2025-09-30 22:07:43.382150", "step": 1371, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:07:43.415628", "step": 1371, "epoch": 3 }, { "type": "loss", "content": 0.000506529351696372, "timestamp": "2025-09-30 22:07:43.441137", "step": 1372, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:07:43.474812", "step": 1372, "epoch": 3 }, { "type": "loss", "content": 0.00016645164578221738, "timestamp": "2025-09-30 22:07:43.477135", "step": 1373, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:07:43.512473", "step": 1373, "epoch": 3 }, { "type": "loss", "content": 0.0001811900729080662, "timestamp": "2025-09-30 22:07:43.514980", "step": 1374, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:07:43.546709", "step": 1374, "epoch": 3 }, { "type": "loss", "content": 0.001400345703586936, "timestamp": "2025-09-30 22:07:43.551305", "step": 1375, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 272 ], "flops": 8068526078144 }, "timestamp": "2025-09-30 22:07:43.583104", "step": 1375, "epoch": 3 }, { "type": "loss", "content": 0.00017988981562666595, "timestamp": "2025-09-30 22:07:43.614522", "step": 1376, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:07:43.646821", "step": 1376, "epoch": 3 }, { "type": "loss", "content": 0.000209273915970698, "timestamp": "2025-09-30 22:07:43.649104", "step": 1377, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-30 22:07:43.682137", "step": 1377, "epoch": 3 }, { "type": "loss", "content": 0.0006141887861303985, "timestamp": "2025-09-30 22:07:43.684330", "step": 1378, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:07:43.717491", "step": 1378, "epoch": 3 }, { "type": "loss", "content": 0.0010347587522119284, "timestamp": "2025-09-30 22:07:43.721848", "step": 1379, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-30 22:07:43.754013", "step": 1379, "epoch": 3 }, { "type": "loss", "content": 0.00014966915477998555, "timestamp": "2025-09-30 22:07:43.782992", "step": 1380, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-30 22:07:43.816666", "step": 1380, "epoch": 3 }, { "type": "loss", "content": 0.014756555669009686, "timestamp": "2025-09-30 22:07:43.821789", "step": 1381, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:07:43.854140", "step": 1381, "epoch": 3 }, { "type": "loss", "content": 0.00017656719137448817, "timestamp": "2025-09-30 22:07:43.858815", "step": 1382, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-30 22:07:43.891147", "step": 1382, "epoch": 3 }, { "type": "loss", "content": 0.011107811704277992, "timestamp": "2025-09-30 22:07:43.893491", "step": 1383, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:07:43.925131", "step": 1383, "epoch": 3 }, { "type": "loss", "content": 0.0002206839999416843, "timestamp": "2025-09-30 22:07:43.949472", "step": 1384, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:07:43.982366", "step": 1384, "epoch": 3 }, { "type": "loss", "content": 0.010964823886752129, "timestamp": "2025-09-30 22:07:43.984532", "step": 1385, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:07:44.017959", "step": 1385, "epoch": 3 }, { "type": "loss", "content": 7.031839777482674e-05, "timestamp": "2025-09-30 22:07:44.020280", "step": 1386, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:07:44.053815", "step": 1386, "epoch": 3 }, { "type": "loss", "content": 0.000803628470748663, "timestamp": "2025-09-30 22:07:44.056177", "step": 1387, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 256 ], "flops": 7593922352128 }, "timestamp": "2025-09-30 22:07:44.096237", "step": 1387, "epoch": 3 }, { "type": "loss", "content": 0.0005003432161174715, "timestamp": "2025-09-30 22:07:44.125286", "step": 1388, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:07:44.164569", "step": 1388, "epoch": 3 }, { "type": "loss", "content": 0.0014421312371268868, "timestamp": "2025-09-30 22:07:44.166908", "step": 1389, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:07:44.198649", "step": 1389, "epoch": 3 }, { "type": "loss", "content": 0.0009203127701766789, "timestamp": "2025-09-30 22:07:44.200711", "step": 1390, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:07:44.233083", "step": 1390, "epoch": 3 }, { "type": "loss", "content": 0.001186627778224647, "timestamp": "2025-09-30 22:07:44.235276", "step": 1391, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:07:44.271707", "step": 1391, "epoch": 3 }, { "type": "loss", "content": 0.00014718440070282668, "timestamp": "2025-09-30 22:07:44.295260", "step": 1392, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:07:44.333181", "step": 1392, "epoch": 3 }, { "type": "loss", "content": 0.00036089521017856896, "timestamp": "2025-09-30 22:07:44.335507", "step": 1393, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:07:44.367482", "step": 1393, "epoch": 3 }, { "type": "loss", "content": 6.875755207147449e-05, "timestamp": "2025-09-30 22:07:44.371873", "step": 1394, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:07:44.404487", "step": 1394, "epoch": 3 }, { "type": "loss", "content": 0.0001918337366078049, "timestamp": "2025-09-30 22:07:44.408881", "step": 1395, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:07:44.450911", "step": 1395, "epoch": 3 }, { "type": "loss", "content": 0.00034902142942883074, "timestamp": "2025-09-30 22:07:44.476425", "step": 1396, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:07:44.523212", "step": 1396, "epoch": 3 }, { "type": "loss", "content": 0.0007774402620270848, "timestamp": "2025-09-30 22:07:44.525276", "step": 1397, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:07:44.560153", "step": 1397, "epoch": 3 }, { "type": "loss", "content": 0.0013828465016558766, "timestamp": "2025-09-30 22:07:44.562494", "step": 1398, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:07:44.595406", "step": 1398, "epoch": 3 }, { "type": "loss", "content": 0.015292760916054249, "timestamp": "2025-09-30 22:07:44.598139", "step": 1399, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:07:44.643739", "step": 1399, "epoch": 3 }, { "type": "loss", "content": 0.000216073400224559, "timestamp": "2025-09-30 22:07:44.669046", "step": 1400, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:07:44.704594", "step": 1400, "epoch": 3 }, { "type": "loss", "content": 0.0016167605062946677, "timestamp": "2025-09-30 22:07:44.706908", "step": 1401, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-30 22:07:44.750476", "step": 1401, "epoch": 3 }, { "type": "loss", "content": 7.860381447244436e-05, "timestamp": "2025-09-30 22:07:44.757450", "step": 1402, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-30 22:07:44.800280", "step": 1402, "epoch": 3 }, { "type": "loss", "content": 0.0003883039462380111, "timestamp": "2025-09-30 22:07:44.802361", "step": 1403, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:07:44.842323", "step": 1403, "epoch": 3 }, { "type": "loss", "content": 0.0006712899194099009, "timestamp": "2025-09-30 22:07:44.868005", "step": 1404, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 224 ], "batch_size": 8, "flops": 4429610391296 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 64 ], "batch_size": 8, "flops": 1265603017216 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 256 ], "batch_size": 8, "flops": 5062411866112 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 5, 144 ], "batch_size": 8, "flops": 2847606704256 } ], "timestamp": "2025-09-30 22:07:45.656702", "step": 1404, "epoch": 3 }, { "type": "pplx", "content": 111324626.72600262, "timestamp": "2025-09-30 22:07:45.658624", "step": 1404, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:07:45.688457", "step": 1404, "epoch": 3 }, { "type": "loss", "content": 0.0016097808256745338, "timestamp": "2025-09-30 22:07:45.691034", "step": 1405, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-30 22:07:45.728705", "step": 1405, "epoch": 3 }, { "type": "loss", "content": 0.00021898788691032678, "timestamp": "2025-09-30 22:07:45.735867", "step": 1406, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:07:45.770158", "step": 1406, "epoch": 3 }, { "type": "loss", "content": 0.0016417077276855707, "timestamp": "2025-09-30 22:07:45.774112", "step": 1407, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:07:45.808998", "step": 1407, "epoch": 3 }, { "type": "loss", "content": 0.00014548443141393363, "timestamp": "2025-09-30 22:07:45.832929", "step": 1408, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-30 22:07:45.868456", "step": 1408, "epoch": 3 }, { "type": "loss", "content": 0.002540131565183401, "timestamp": "2025-09-30 22:07:45.873402", "step": 1409, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:07:45.907205", "step": 1409, "epoch": 3 }, { "type": "loss", "content": 0.0002596612030174583, "timestamp": "2025-09-30 22:07:45.911392", "step": 1410, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:07:45.942423", "step": 1410, "epoch": 3 }, { "type": "loss", "content": 0.0001415082806488499, "timestamp": "2025-09-30 22:07:45.944951", "step": 1411, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:07:45.980839", "step": 1411, "epoch": 3 }, { "type": "loss", "content": 0.0008175332914106548, "timestamp": "2025-09-30 22:07:46.005676", "step": 1412, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:07:46.037564", "step": 1412, "epoch": 3 }, { "type": "loss", "content": 5.09199126099702e-05, "timestamp": "2025-09-30 22:07:46.039734", "step": 1413, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:07:46.072284", "step": 1413, "epoch": 3 }, { "type": "loss", "content": 0.056001435965299606, "timestamp": "2025-09-30 22:07:46.075028", "step": 1414, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-30 22:07:46.116722", "step": 1414, "epoch": 3 }, { "type": "loss", "content": 0.0008140717982314527, "timestamp": "2025-09-30 22:07:46.124299", "step": 1415, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:07:46.174037", "step": 1415, "epoch": 3 }, { "type": "loss", "content": 0.01871386542916298, "timestamp": "2025-09-30 22:07:46.198975", "step": 1416, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:07:46.239964", "step": 1416, "epoch": 3 }, { "type": "loss", "content": 0.0017616671975702047, "timestamp": "2025-09-30 22:07:46.241845", "step": 1417, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-30 22:07:46.282331", "step": 1417, "epoch": 3 }, { "type": "loss", "content": 5.8603611250873655e-05, "timestamp": "2025-09-30 22:07:46.289616", "step": 1418, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:07:46.327101", "step": 1418, "epoch": 3 }, { "type": "loss", "content": 0.00041110877646133304, "timestamp": "2025-09-30 22:07:46.331307", "step": 1419, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:07:46.369163", "step": 1419, "epoch": 3 }, { "type": "loss", "content": 8.381939551327378e-05, "timestamp": "2025-09-30 22:07:46.392799", "step": 1420, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:07:46.425008", "step": 1420, "epoch": 3 }, { "type": "loss", "content": 0.00018263938545715064, "timestamp": "2025-09-30 22:07:46.427179", "step": 1421, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:07:46.458869", "step": 1421, "epoch": 3 }, { "type": "loss", "content": 0.00024420252884738147, "timestamp": "2025-09-30 22:07:46.461665", "step": 1422, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-30 22:07:46.493460", "step": 1422, "epoch": 3 }, { "type": "loss", "content": 0.0002778305788524449, "timestamp": "2025-09-30 22:07:46.499142", "step": 1423, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:07:46.532510", "step": 1423, "epoch": 3 }, { "type": "loss", "content": 6.686317647108808e-05, "timestamp": "2025-09-30 22:07:46.556191", "step": 1424, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:07:46.590168", "step": 1424, "epoch": 3 }, { "type": "loss", "content": 0.00014924805145710707, "timestamp": "2025-09-30 22:07:46.592375", "step": 1425, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:07:46.623259", "step": 1425, "epoch": 3 }, { "type": "loss", "content": 0.00018138554878532887, "timestamp": "2025-09-30 22:07:46.625203", "step": 1426, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:07:46.660949", "step": 1426, "epoch": 3 }, { "type": "loss", "content": 0.0007415400468744338, "timestamp": "2025-09-30 22:07:46.665233", "step": 1427, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-30 22:07:46.695568", "step": 1427, "epoch": 3 }, { "type": "loss", "content": 0.005260678939521313, "timestamp": "2025-09-30 22:07:46.724287", "step": 1428, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:07:46.756642", "step": 1428, "epoch": 3 }, { "type": "loss", "content": 0.00022222170082386583, "timestamp": "2025-09-30 22:07:46.759042", "step": 1429, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:07:46.794819", "step": 1429, "epoch": 3 }, { "type": "loss", "content": 0.000168702652445063, "timestamp": "2025-09-30 22:07:46.797581", "step": 1430, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:07:46.841723", "step": 1430, "epoch": 3 }, { "type": "loss", "content": 0.0005504986620508134, "timestamp": "2025-09-30 22:07:46.843896", "step": 1431, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:07:46.878320", "step": 1431, "epoch": 3 }, { "type": "loss", "content": 0.0018384003778919578, "timestamp": "2025-09-30 22:07:46.903319", "step": 1432, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:07:46.936905", "step": 1432, "epoch": 3 }, { "type": "loss", "content": 0.007305443286895752, "timestamp": "2025-09-30 22:07:46.940047", "step": 1433, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-30 22:07:46.974532", "step": 1433, "epoch": 3 }, { "type": "loss", "content": 0.00016536428302060813, "timestamp": "2025-09-30 22:07:46.981490", "step": 1434, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:07:47.014908", "step": 1434, "epoch": 3 }, { "type": "loss", "content": 0.00028782375738956034, "timestamp": "2025-09-30 22:07:47.019297", "step": 1435, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:07:47.052292", "step": 1435, "epoch": 3 }, { "type": "loss", "content": 0.00013649475295096636, "timestamp": "2025-09-30 22:07:47.077558", "step": 1436, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:07:47.109343", "step": 1436, "epoch": 3 }, { "type": "loss", "content": 0.00019715628877747804, "timestamp": "2025-09-30 22:07:47.111552", "step": 1437, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:07:47.142209", "step": 1437, "epoch": 3 }, { "type": "loss", "content": 0.001548089669086039, "timestamp": "2025-09-30 22:07:47.144674", "step": 1438, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:07:47.176869", "step": 1438, "epoch": 3 }, { "type": "loss", "content": 0.00021238917543087155, "timestamp": "2025-09-30 22:07:47.179722", "step": 1439, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:07:47.210562", "step": 1439, "epoch": 3 }, { "type": "loss", "content": 0.0016702745342627168, "timestamp": "2025-09-30 22:07:47.235882", "step": 1440, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:07:47.267065", "step": 1440, "epoch": 3 }, { "type": "loss", "content": 7.280301360879093e-05, "timestamp": "2025-09-30 22:07:47.270100", "step": 1441, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:07:47.302565", "step": 1441, "epoch": 3 }, { "type": "loss", "content": 0.011224872432649136, "timestamp": "2025-09-30 22:07:47.307249", "step": 1442, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:07:47.340103", "step": 1442, "epoch": 3 }, { "type": "loss", "content": 0.0005737360916100442, "timestamp": "2025-09-30 22:07:47.344784", "step": 1443, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 224 ], "batch_size": 8, "flops": 4429610391296 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 64 ], "batch_size": 8, "flops": 1265603017216 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 256 ], "batch_size": 8, "flops": 5062411866112 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 5, 144 ], "batch_size": 8, "flops": 2847606704256 } ], "timestamp": "2025-09-30 22:07:48.038441", "step": 1443, "epoch": 3 }, { "type": "pplx", "content": 110657800.76337399, "timestamp": "2025-09-30 22:07:48.040960", "step": 1443, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:07:48.073885", "step": 1443, "epoch": 3 }, { "type": "loss", "content": 0.00037595274625346065, "timestamp": "2025-09-30 22:07:48.097896", "step": 1444, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-30 22:07:48.131159", "step": 1444, "epoch": 3 }, { "type": "loss", "content": 5.4645923228235915e-05, "timestamp": "2025-09-30 22:07:48.136051", "step": 1445, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-30 22:07:48.168214", "step": 1445, "epoch": 3 }, { "type": "loss", "content": 0.00010486804967513308, "timestamp": "2025-09-30 22:07:48.175111", "step": 1446, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-30 22:07:48.210247", "step": 1446, "epoch": 3 }, { "type": "loss", "content": 0.0005528793553821743, "timestamp": "2025-09-30 22:07:48.217190", "step": 1447, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:07:48.253649", "step": 1447, "epoch": 3 }, { "type": "loss", "content": 0.00015464976604562253, "timestamp": "2025-09-30 22:07:48.278708", "step": 1448, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:07:48.314094", "step": 1448, "epoch": 3 }, { "type": "loss", "content": 0.0005388990975916386, "timestamp": "2025-09-30 22:07:48.316676", "step": 1449, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:07:48.350531", "step": 1449, "epoch": 3 }, { "type": "loss", "content": 0.04156193509697914, "timestamp": "2025-09-30 22:07:48.353251", "step": 1450, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:07:48.394087", "step": 1450, "epoch": 3 }, { "type": "loss", "content": 0.0002184375043725595, "timestamp": "2025-09-30 22:07:48.396812", "step": 1451, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:07:48.430159", "step": 1451, "epoch": 3 }, { "type": "loss", "content": 0.0006777126109227538, "timestamp": "2025-09-30 22:07:48.454481", "step": 1452, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-30 22:07:48.491447", "step": 1452, "epoch": 3 }, { "type": "loss", "content": 0.01857731305062771, "timestamp": "2025-09-30 22:07:48.493951", "step": 1453, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-30 22:07:48.529874", "step": 1453, "epoch": 3 }, { "type": "loss", "content": 0.006827401462942362, "timestamp": "2025-09-30 22:07:48.536715", "step": 1454, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:07:48.572067", "step": 1454, "epoch": 3 }, { "type": "loss", "content": 0.000535592611413449, "timestamp": "2025-09-30 22:07:48.574354", "step": 1455, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:07:48.605918", "step": 1455, "epoch": 3 }, { "type": "loss", "content": 0.0012004311429336667, "timestamp": "2025-09-30 22:07:48.629656", "step": 1456, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-30 22:07:48.663068", "step": 1456, "epoch": 3 }, { "type": "loss", "content": 0.0042284573428332806, "timestamp": "2025-09-30 22:07:48.667190", "step": 1457, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:07:48.700364", "step": 1457, "epoch": 3 }, { "type": "loss", "content": 0.0034628177527338266, "timestamp": "2025-09-30 22:07:48.703026", "step": 1458, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-30 22:07:48.742729", "step": 1458, "epoch": 3 }, { "type": "loss", "content": 0.00019150369917042553, "timestamp": "2025-09-30 22:07:48.747333", "step": 1459, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:07:48.781159", "step": 1459, "epoch": 3 }, { "type": "loss", "content": 0.013045864179730415, "timestamp": "2025-09-30 22:07:48.806102", "step": 1460, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:07:48.837908", "step": 1460, "epoch": 3 }, { "type": "loss", "content": 0.00012074068217771128, "timestamp": "2025-09-30 22:07:48.840251", "step": 1461, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:07:48.877361", "step": 1461, "epoch": 3 }, { "type": "loss", "content": 8.07570613687858e-05, "timestamp": "2025-09-30 22:07:48.880079", "step": 1462, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-30 22:07:48.910600", "step": 1462, "epoch": 3 }, { "type": "loss", "content": 0.00021056282275822014, "timestamp": "2025-09-30 22:07:48.912779", "step": 1463, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 256 ], "flops": 7593922352128 }, "timestamp": "2025-09-30 22:07:48.946052", "step": 1463, "epoch": 3 }, { "type": "loss", "content": 0.006550428923219442, "timestamp": "2025-09-30 22:07:48.975086", "step": 1464, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:07:49.006721", "step": 1464, "epoch": 3 }, { "type": "loss", "content": 0.000441436015535146, "timestamp": "2025-09-30 22:07:49.009015", "step": 1465, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 256 ], "flops": 7593922352128 }, "timestamp": "2025-09-30 22:07:49.040591", "step": 1465, "epoch": 3 }, { "type": "loss", "content": 0.0004042996733915061, "timestamp": "2025-09-30 22:07:49.048783", "step": 1466, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:07:49.080751", "step": 1466, "epoch": 3 }, { "type": "loss", "content": 7.870148692745715e-05, "timestamp": "2025-09-30 22:07:49.085230", "step": 1467, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 272 ], "flops": 8068526078144 }, "timestamp": "2025-09-30 22:07:49.118125", "step": 1467, "epoch": 3 }, { "type": "loss", "content": 0.005421972833573818, "timestamp": "2025-09-30 22:07:49.149473", "step": 1468, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-30 22:07:49.185397", "step": 1468, "epoch": 3 }, { "type": "loss", "content": 0.0005969242192804813, "timestamp": "2025-09-30 22:07:49.190180", "step": 1469, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-30 22:07:49.223876", "step": 1469, "epoch": 3 }, { "type": "loss", "content": 0.0004208089376334101, "timestamp": "2025-09-30 22:07:49.230889", "step": 1470, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:07:49.263973", "step": 1470, "epoch": 3 }, { "type": "loss", "content": 0.0075470260344445705, "timestamp": "2025-09-30 22:07:49.266058", "step": 1471, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:07:49.300310", "step": 1471, "epoch": 3 }, { "type": "loss", "content": 0.0005702655180357397, "timestamp": "2025-09-30 22:07:49.325800", "step": 1472, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:07:49.368781", "step": 1472, "epoch": 3 }, { "type": "loss", "content": 0.00041888977284543216, "timestamp": "2025-09-30 22:07:49.371029", "step": 1473, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:07:49.402474", "step": 1473, "epoch": 3 }, { "type": "loss", "content": 8.177950803656131e-05, "timestamp": "2025-09-30 22:07:49.404944", "step": 1474, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-30 22:07:49.437614", "step": 1474, "epoch": 3 }, { "type": "loss", "content": 7.957070920383558e-05, "timestamp": "2025-09-30 22:07:49.444752", "step": 1475, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:07:49.477866", "step": 1475, "epoch": 3 }, { "type": "loss", "content": 0.02284647338092327, "timestamp": "2025-09-30 22:07:49.501905", "step": 1476, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:07:49.538854", "step": 1476, "epoch": 3 }, { "type": "loss", "content": 0.006310875993221998, "timestamp": "2025-09-30 22:07:49.540904", "step": 1477, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-30 22:07:49.575727", "step": 1477, "epoch": 3 }, { "type": "loss", "content": 0.00017135801317635924, "timestamp": "2025-09-30 22:07:49.583040", "step": 1478, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-30 22:07:49.619405", "step": 1478, "epoch": 3 }, { "type": "loss", "content": 0.007718032691627741, "timestamp": "2025-09-30 22:07:49.626760", "step": 1479, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:07:49.663438", "step": 1479, "epoch": 3 }, { "type": "loss", "content": 0.00022026248916517943, "timestamp": "2025-09-30 22:07:49.687202", "step": 1480, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:07:49.718874", "step": 1480, "epoch": 3 }, { "type": "loss", "content": 0.0012717212084680796, "timestamp": "2025-09-30 22:07:49.720862", "step": 1481, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 256 ], "flops": 7593922352128 }, "timestamp": "2025-09-30 22:07:49.752263", "step": 1481, "epoch": 3 }, { "type": "loss", "content": 0.001083889976143837, "timestamp": "2025-09-30 22:07:49.760158", "step": 1482, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 224 ], "batch_size": 8, "flops": 4429610391296 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 64 ], "batch_size": 8, "flops": 1265603017216 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 256 ], "batch_size": 8, "flops": 5062411866112 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 5, 144 ], "batch_size": 8, "flops": 2847606704256 } ], "timestamp": "2025-09-30 22:07:50.388619", "step": 1482, "epoch": 3 }, { "type": "pplx", "content": 109402435.07773964, "timestamp": "2025-09-30 22:07:50.390663", "step": 1482, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:07:50.421017", "step": 1482, "epoch": 3 }, { "type": "loss", "content": 0.0057127587497234344, "timestamp": "2025-09-30 22:07:50.425123", "step": 1483, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:07:50.457499", "step": 1483, "epoch": 3 }, { "type": "loss", "content": 0.0002413246693322435, "timestamp": "2025-09-30 22:07:50.483002", "step": 1484, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:07:50.520018", "step": 1484, "epoch": 3 }, { "type": "loss", "content": 0.016218166798353195, "timestamp": "2025-09-30 22:07:50.522073", "step": 1485, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:07:50.563361", "step": 1485, "epoch": 3 }, { "type": "loss", "content": 0.00025467827799730003, "timestamp": "2025-09-30 22:07:50.567887", "step": 1486, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-30 22:07:50.599267", "step": 1486, "epoch": 3 }, { "type": "loss", "content": 0.0005150886718183756, "timestamp": "2025-09-30 22:07:50.606375", "step": 1487, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:07:50.637765", "step": 1487, "epoch": 3 }, { "type": "loss", "content": 0.0004436791059561074, "timestamp": "2025-09-30 22:07:50.663012", "step": 1488, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-30 22:07:50.694451", "step": 1488, "epoch": 3 }, { "type": "loss", "content": 0.006838344968855381, "timestamp": "2025-09-30 22:07:50.698960", "step": 1489, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:07:50.730865", "step": 1489, "epoch": 3 }, { "type": "loss", "content": 0.015281050466001034, "timestamp": "2025-09-30 22:07:50.735240", "step": 1490, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-30 22:07:50.766604", "step": 1490, "epoch": 3 }, { "type": "loss", "content": 0.0014932630583643913, "timestamp": "2025-09-30 22:07:50.773774", "step": 1491, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-30 22:07:50.805710", "step": 1491, "epoch": 3 }, { "type": "loss", "content": 0.0003391748759895563, "timestamp": "2025-09-30 22:07:50.834430", "step": 1492, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:07:50.865966", "step": 1492, "epoch": 3 }, { "type": "loss", "content": 0.00013037241296842694, "timestamp": "2025-09-30 22:07:50.868409", "step": 1493, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:07:50.899827", "step": 1493, "epoch": 3 }, { "type": "loss", "content": 0.0005963169387541711, "timestamp": "2025-09-30 22:07:50.903989", "step": 1494, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-30 22:07:50.934506", "step": 1494, "epoch": 3 }, { "type": "loss", "content": 5.436746869236231e-05, "timestamp": "2025-09-30 22:07:50.941603", "step": 1495, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:07:50.973050", "step": 1495, "epoch": 3 }, { "type": "loss", "content": 0.00010126921552000567, "timestamp": "2025-09-30 22:07:50.996877", "step": 1496, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:07:51.028055", "step": 1496, "epoch": 3 }, { "type": "loss", "content": 0.0009686009143479168, "timestamp": "2025-09-30 22:07:51.030132", "step": 1497, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:07:51.061647", "step": 1497, "epoch": 3 }, { "type": "loss", "content": 0.0005619805306196213, "timestamp": "2025-09-30 22:07:51.064170", "step": 1498, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:07:51.095934", "step": 1498, "epoch": 3 }, { "type": "loss", "content": 0.0003612506261561066, "timestamp": "2025-09-30 22:07:51.098090", "step": 1499, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:07:51.129365", "step": 1499, "epoch": 3 }, { "type": "loss", "content": 0.008169925771653652, "timestamp": "2025-09-30 22:07:51.153105", "step": 1500, "epoch": 3 }, { "type": "info", "content": "Checkpoint saved at step 1500", "timestamp": "2025-09-30 22:07:56.007630", "step": 1500, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:07:56.056192", "step": 1500, "epoch": 3 }, { "type": "loss", "content": 0.011021244339644909, "timestamp": "2025-09-30 22:07:56.066412", "step": 1501, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:07:56.099565", "step": 1501, "epoch": 3 }, { "type": "loss", "content": 0.00015265133697539568, "timestamp": "2025-09-30 22:07:56.105586", "step": 1502, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-30 22:07:56.143258", "step": 1502, "epoch": 3 }, { "type": "loss", "content": 0.005286376923322678, "timestamp": "2025-09-30 22:07:56.150186", "step": 1503, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 256 ], "flops": 7593922352128 }, "timestamp": "2025-09-30 22:07:56.189838", "step": 1503, "epoch": 3 }, { "type": "loss", "content": 0.00011992856889264658, "timestamp": "2025-09-30 22:07:56.219725", "step": 1504, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:07:56.263434", "step": 1504, "epoch": 3 }, { "type": "loss", "content": 0.006383276078850031, "timestamp": "2025-09-30 22:07:56.266521", "step": 1505, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-30 22:07:56.306137", "step": 1505, "epoch": 3 }, { "type": "loss", "content": 0.0003791770723182708, "timestamp": "2025-09-30 22:07:56.314141", "step": 1506, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:07:56.364410", "step": 1506, "epoch": 3 }, { "type": "loss", "content": 0.0002968795888591558, "timestamp": "2025-09-30 22:07:56.369283", "step": 1507, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:07:56.417705", "step": 1507, "epoch": 3 }, { "type": "loss", "content": 0.0003191308060195297, "timestamp": "2025-09-30 22:07:56.442819", "step": 1508, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:07:56.476820", "step": 1508, "epoch": 3 }, { "type": "loss", "content": 0.0005938306567259133, "timestamp": "2025-09-30 22:07:56.480008", "step": 1509, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:07:56.521308", "step": 1509, "epoch": 3 }, { "type": "loss", "content": 0.0016930060228332877, "timestamp": "2025-09-30 22:07:56.526990", "step": 1510, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:07:56.561894", "step": 1510, "epoch": 3 }, { "type": "loss", "content": 0.0003255842602811754, "timestamp": "2025-09-30 22:07:56.572108", "step": 1511, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:07:56.605706", "step": 1511, "epoch": 3 }, { "type": "loss", "content": 0.000320999271934852, "timestamp": "2025-09-30 22:07:56.629936", "step": 1512, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:07:56.662094", "step": 1512, "epoch": 3 }, { "type": "loss", "content": 0.00016117122140713036, "timestamp": "2025-09-30 22:07:56.668838", "step": 1513, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:07:56.707464", "step": 1513, "epoch": 3 }, { "type": "loss", "content": 0.0009694714681245387, "timestamp": "2025-09-30 22:07:56.714135", "step": 1514, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:07:56.753636", "step": 1514, "epoch": 3 }, { "type": "loss", "content": 0.0008338862680830061, "timestamp": "2025-09-30 22:07:56.758078", "step": 1515, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:07:56.799819", "step": 1515, "epoch": 3 }, { "type": "loss", "content": 8.57623599586077e-05, "timestamp": "2025-09-30 22:07:56.826578", "step": 1516, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:07:56.861369", "step": 1516, "epoch": 3 }, { "type": "loss", "content": 0.0021684980019927025, "timestamp": "2025-09-30 22:07:56.873671", "step": 1517, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-30 22:07:56.913424", "step": 1517, "epoch": 3 }, { "type": "loss", "content": 0.00260011269710958, "timestamp": "2025-09-30 22:07:56.920385", "step": 1518, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-30 22:07:56.959620", "step": 1518, "epoch": 3 }, { "type": "loss", "content": 0.0006647937698289752, "timestamp": "2025-09-30 22:07:56.967446", "step": 1519, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-30 22:07:56.999268", "step": 1519, "epoch": 3 }, { "type": "loss", "content": 0.0001569747255416587, "timestamp": "2025-09-30 22:07:57.030104", "step": 1520, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-30 22:07:57.078530", "step": 1520, "epoch": 3 }, { "type": "loss", "content": 0.0003687713178806007, "timestamp": "2025-09-30 22:07:57.087089", "step": 1521, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 224 ], "batch_size": 8, "flops": 4429610391296 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 64 ], "batch_size": 8, "flops": 1265603017216 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 256 ], "batch_size": 8, "flops": 5062411866112 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 5, 144 ], "batch_size": 8, "flops": 2847606704256 } ], "timestamp": "2025-09-30 22:07:57.765660", "step": 1521, "epoch": 3 }, { "type": "pplx", "content": 108316397.555252, "timestamp": "2025-09-30 22:07:57.773309", "step": 1521, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 272 ], "flops": 8068526078144 }, "timestamp": "2025-09-30 22:07:57.809493", "step": 1521, "epoch": 3 }, { "type": "loss", "content": 7.841020851628855e-05, "timestamp": "2025-09-30 22:07:57.819649", "step": 1522, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:07:57.856081", "step": 1522, "epoch": 3 }, { "type": "loss", "content": 0.00047487596748396754, "timestamp": "2025-09-30 22:07:57.863331", "step": 1523, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-30 22:07:57.903662", "step": 1523, "epoch": 3 }, { "type": "loss", "content": 0.0012490659719333053, "timestamp": "2025-09-30 22:07:57.932164", "step": 1524, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:07:57.967105", "step": 1524, "epoch": 3 }, { "type": "loss", "content": 0.0005907863960601389, "timestamp": "2025-09-30 22:07:57.972722", "step": 1525, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:07:58.010496", "step": 1525, "epoch": 3 }, { "type": "loss", "content": 0.0002828697324730456, "timestamp": "2025-09-30 22:07:58.014933", "step": 1526, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:07:58.047621", "step": 1526, "epoch": 3 }, { "type": "loss", "content": 0.0006750501925125718, "timestamp": "2025-09-30 22:07:58.053908", "step": 1527, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:07:58.098564", "step": 1527, "epoch": 3 }, { "type": "loss", "content": 0.0007431631092913449, "timestamp": "2025-09-30 22:07:58.128539", "step": 1528, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 256 ], "flops": 7593922352128 }, "timestamp": "2025-09-30 22:07:58.168866", "step": 1528, "epoch": 3 }, { "type": "loss", "content": 0.00021667192049790174, "timestamp": "2025-09-30 22:07:58.176704", "step": 1529, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 304 ], "flops": 9017733530176 }, "timestamp": "2025-09-30 22:07:58.212352", "step": 1529, "epoch": 3 }, { "type": "loss", "content": 0.0002985773899126798, "timestamp": "2025-09-30 22:07:58.224713", "step": 1530, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:07:58.262613", "step": 1530, "epoch": 3 }, { "type": "loss", "content": 0.00027525509358383715, "timestamp": "2025-09-30 22:07:58.269304", "step": 1531, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-30 22:07:58.305293", "step": 1531, "epoch": 3 }, { "type": "loss", "content": 0.0006788480677641928, "timestamp": "2025-09-30 22:07:58.334872", "step": 1532, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-30 22:07:58.374429", "step": 1532, "epoch": 3 }, { "type": "loss", "content": 0.00018581384210847318, "timestamp": "2025-09-30 22:07:58.381574", "step": 1533, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:07:58.419942", "step": 1533, "epoch": 3 }, { "type": "loss", "content": 0.00016812498506624252, "timestamp": "2025-09-30 22:07:58.432510", "step": 1534, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:07:58.471750", "step": 1534, "epoch": 3 }, { "type": "loss", "content": 0.0006146311643533409, "timestamp": "2025-09-30 22:07:58.476170", "step": 1535, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:07:58.523246", "step": 1535, "epoch": 3 }, { "type": "loss", "content": 0.00039723937516100705, "timestamp": "2025-09-30 22:07:58.553914", "step": 1536, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:07:58.586459", "step": 1536, "epoch": 3 }, { "type": "loss", "content": 0.001720373285934329, "timestamp": "2025-09-30 22:07:58.592902", "step": 1537, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:07:58.628973", "step": 1537, "epoch": 3 }, { "type": "loss", "content": 0.0002852992038242519, "timestamp": "2025-09-30 22:07:58.633065", "step": 1538, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:07:58.674476", "step": 1538, "epoch": 3 }, { "type": "loss", "content": 0.0007187298615463078, "timestamp": "2025-09-30 22:07:58.681138", "step": 1539, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:07:58.720500", "step": 1539, "epoch": 3 }, { "type": "loss", "content": 0.0001630386832403019, "timestamp": "2025-09-30 22:07:58.744828", "step": 1540, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-30 22:07:58.777185", "step": 1540, "epoch": 3 }, { "type": "loss", "content": 0.00010030083649326116, "timestamp": "2025-09-30 22:07:58.781824", "step": 1541, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:07:58.827357", "step": 1541, "epoch": 3 }, { "type": "loss", "content": 0.0005012182518839836, "timestamp": "2025-09-30 22:07:58.829764", "step": 1542, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:07:58.862190", "step": 1542, "epoch": 3 }, { "type": "loss", "content": 0.00025205491692759097, "timestamp": "2025-09-30 22:07:58.864801", "step": 1543, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:07:58.913324", "step": 1543, "epoch": 3 }, { "type": "loss", "content": 0.00020313379354774952, "timestamp": "2025-09-30 22:07:58.940418", "step": 1544, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:07:58.981896", "step": 1544, "epoch": 3 }, { "type": "loss", "content": 4.318432183936238e-05, "timestamp": "2025-09-30 22:07:58.986189", "step": 1545, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-30 22:07:59.029376", "step": 1545, "epoch": 3 }, { "type": "loss", "content": 0.00014498786185868084, "timestamp": "2025-09-30 22:07:59.036725", "step": 1546, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:07:59.074469", "step": 1546, "epoch": 3 }, { "type": "loss", "content": 0.0003494999255053699, "timestamp": "2025-09-30 22:07:59.079065", "step": 1547, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:07:59.111455", "step": 1547, "epoch": 3 }, { "type": "loss", "content": 0.0018302559619769454, "timestamp": "2025-09-30 22:07:59.138392", "step": 1548, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:07:59.171635", "step": 1548, "epoch": 3 }, { "type": "loss", "content": 0.01349946204572916, "timestamp": "2025-09-30 22:07:59.175050", "step": 1549, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:07:59.219607", "step": 1549, "epoch": 3 }, { "type": "loss", "content": 0.00011048569285776466, "timestamp": "2025-09-30 22:07:59.229055", "step": 1550, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-30 22:07:59.275463", "step": 1550, "epoch": 3 }, { "type": "loss", "content": 0.00011431624443503097, "timestamp": "2025-09-30 22:07:59.279372", "step": 1551, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-30 22:07:59.321139", "step": 1551, "epoch": 3 }, { "type": "loss", "content": 6.0733193095074967e-05, "timestamp": "2025-09-30 22:07:59.349157", "step": 1552, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-30 22:07:59.390225", "step": 1552, "epoch": 3 }, { "type": "loss", "content": 0.0003296414215583354, "timestamp": "2025-09-30 22:07:59.394857", "step": 1553, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-30 22:07:59.427894", "step": 1553, "epoch": 3 }, { "type": "loss", "content": 0.000213971987250261, "timestamp": "2025-09-30 22:07:59.436213", "step": 1554, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:07:59.483135", "step": 1554, "epoch": 3 }, { "type": "loss", "content": 0.0001407633681083098, "timestamp": "2025-09-30 22:07:59.488822", "step": 1555, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-30 22:07:59.533912", "step": 1555, "epoch": 3 }, { "type": "loss", "content": 0.0003707043069880456, "timestamp": "2025-09-30 22:07:59.561812", "step": 1556, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:07:59.604114", "step": 1556, "epoch": 3 }, { "type": "loss", "content": 0.0002882958797272295, "timestamp": "2025-09-30 22:07:59.610152", "step": 1557, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:07:59.653434", "step": 1557, "epoch": 3 }, { "type": "loss", "content": 0.004305084235966206, "timestamp": "2025-09-30 22:07:59.660444", "step": 1558, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:07:59.698055", "step": 1558, "epoch": 3 }, { "type": "loss", "content": 0.0006565157091245055, "timestamp": "2025-09-30 22:07:59.705079", "step": 1559, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-30 22:07:59.738815", "step": 1559, "epoch": 3 }, { "type": "loss", "content": 0.00025418223231099546, "timestamp": "2025-09-30 22:07:59.766899", "step": 1560, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 224 ], "batch_size": 8, "flops": 4429610391296 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 64 ], "batch_size": 8, "flops": 1265603017216 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 256 ], "batch_size": 8, "flops": 5062411866112 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 5, 144 ], "batch_size": 8, "flops": 2847606704256 } ], "timestamp": "2025-09-30 22:08:00.508526", "step": 1560, "epoch": 3 }, { "type": "pplx", "content": 112198427.95504099, "timestamp": "2025-09-30 22:08:00.510974", "step": 1560, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:08:00.540871", "step": 1560, "epoch": 3 }, { "type": "loss", "content": 0.0014659542357549071, "timestamp": "2025-09-30 22:08:00.543889", "step": 1561, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:08:00.578213", "step": 1561, "epoch": 3 }, { "type": "loss", "content": 0.001355137093923986, "timestamp": "2025-09-30 22:08:00.582836", "step": 1562, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:08:00.615430", "step": 1562, "epoch": 3 }, { "type": "loss", "content": 0.0008022190886549652, "timestamp": "2025-09-30 22:08:00.617561", "step": 1563, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:08:00.652645", "step": 1563, "epoch": 3 }, { "type": "loss", "content": 0.05110324174165726, "timestamp": "2025-09-30 22:08:00.677735", "step": 1564, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:08:00.711624", "step": 1564, "epoch": 3 }, { "type": "loss", "content": 0.06121738627552986, "timestamp": "2025-09-30 22:08:00.714775", "step": 1565, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:08:00.763825", "step": 1565, "epoch": 3 }, { "type": "loss", "content": 0.0002549318887759, "timestamp": "2025-09-30 22:08:00.765929", "step": 1566, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:08:00.809761", "step": 1566, "epoch": 3 }, { "type": "loss", "content": 0.0022373481187969446, "timestamp": "2025-09-30 22:08:00.812029", "step": 1567, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-30 22:08:00.854177", "step": 1567, "epoch": 3 }, { "type": "loss", "content": 0.00020157112157903612, "timestamp": "2025-09-30 22:08:00.882886", "step": 1568, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:08:00.920926", "step": 1568, "epoch": 3 }, { "type": "loss", "content": 0.004094513598829508, "timestamp": "2025-09-30 22:08:00.923616", "step": 1569, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:08:00.965875", "step": 1569, "epoch": 3 }, { "type": "loss", "content": 0.00041190601768903434, "timestamp": "2025-09-30 22:08:00.970108", "step": 1570, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:08:01.011757", "step": 1570, "epoch": 3 }, { "type": "loss", "content": 6.220638169907033e-05, "timestamp": "2025-09-30 22:08:01.016471", "step": 1571, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:08:01.053329", "step": 1571, "epoch": 3 }, { "type": "loss", "content": 3.0949806387070566e-05, "timestamp": "2025-09-30 22:08:01.078440", "step": 1572, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:08:01.120823", "step": 1572, "epoch": 3 }, { "type": "loss", "content": 7.183482375694439e-05, "timestamp": "2025-09-30 22:08:01.123523", "step": 1573, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:08:01.160656", "step": 1573, "epoch": 3 }, { "type": "loss", "content": 6.708560977131128e-05, "timestamp": "2025-09-30 22:08:01.163300", "step": 1574, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-30 22:08:01.202125", "step": 1574, "epoch": 3 }, { "type": "loss", "content": 6.0947448218939826e-05, "timestamp": "2025-09-30 22:08:01.209942", "step": 1575, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:08:01.244074", "step": 1575, "epoch": 3 }, { "type": "loss", "content": 0.00033145371708087623, "timestamp": "2025-09-30 22:08:01.270433", "step": 1576, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:08:01.308732", "step": 1576, "epoch": 3 }, { "type": "loss", "content": 0.00012974298442713916, "timestamp": "2025-09-30 22:08:01.320260", "step": 1577, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:08:01.361271", "step": 1577, "epoch": 3 }, { "type": "loss", "content": 0.001361784408800304, "timestamp": "2025-09-30 22:08:01.363606", "step": 1578, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-30 22:08:01.404572", "step": 1578, "epoch": 3 }, { "type": "loss", "content": 7.593195186927915e-05, "timestamp": "2025-09-30 22:08:01.413470", "step": 1579, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:08:01.448140", "step": 1579, "epoch": 3 }, { "type": "loss", "content": 0.0013231473276391625, "timestamp": "2025-09-30 22:08:01.471885", "step": 1580, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:08:01.503369", "step": 1580, "epoch": 3 }, { "type": "loss", "content": 0.0015925122424960136, "timestamp": "2025-09-30 22:08:01.507497", "step": 1581, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:08:01.538654", "step": 1581, "epoch": 3 }, { "type": "loss", "content": 0.0001307614438701421, "timestamp": "2025-09-30 22:08:01.541790", "step": 1582, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:08:01.576886", "step": 1582, "epoch": 3 }, { "type": "loss", "content": 0.0006219060160219669, "timestamp": "2025-09-30 22:08:01.581903", "step": 1583, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-30 22:08:01.615786", "step": 1583, "epoch": 3 }, { "type": "loss", "content": 0.00012529719970189035, "timestamp": "2025-09-30 22:08:01.642023", "step": 1584, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:08:01.676258", "step": 1584, "epoch": 3 }, { "type": "loss", "content": 0.0003298810333944857, "timestamp": "2025-09-30 22:08:01.679148", "step": 1585, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:08:01.713366", "step": 1585, "epoch": 3 }, { "type": "loss", "content": 0.00010575191117823124, "timestamp": "2025-09-30 22:08:01.718128", "step": 1586, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-30 22:08:01.753223", "step": 1586, "epoch": 3 }, { "type": "loss", "content": 4.924646418658085e-05, "timestamp": "2025-09-30 22:08:01.760425", "step": 1587, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-30 22:08:01.794419", "step": 1587, "epoch": 3 }, { "type": "loss", "content": 0.0003786073357332498, "timestamp": "2025-09-30 22:08:01.820784", "step": 1588, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:08:01.859129", "step": 1588, "epoch": 3 }, { "type": "loss", "content": 0.001415244652889669, "timestamp": "2025-09-30 22:08:01.864451", "step": 1589, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:08:01.897159", "step": 1589, "epoch": 3 }, { "type": "loss", "content": 0.0006168781546875834, "timestamp": "2025-09-30 22:08:01.902324", "step": 1590, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-30 22:08:01.934825", "step": 1590, "epoch": 3 }, { "type": "loss", "content": 5.856368807144463e-05, "timestamp": "2025-09-30 22:08:01.941885", "step": 1591, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:08:01.978997", "step": 1591, "epoch": 3 }, { "type": "loss", "content": 0.0019562358502298594, "timestamp": "2025-09-30 22:08:02.007784", "step": 1592, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:08:02.043044", "step": 1592, "epoch": 3 }, { "type": "loss", "content": 0.0004735655675176531, "timestamp": "2025-09-30 22:08:02.045587", "step": 1593, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:08:02.081380", "step": 1593, "epoch": 3 }, { "type": "loss", "content": 0.00011099340918008238, "timestamp": "2025-09-30 22:08:02.087946", "step": 1594, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:08:02.125184", "step": 1594, "epoch": 3 }, { "type": "loss", "content": 0.00012295310443732888, "timestamp": "2025-09-30 22:08:02.129602", "step": 1595, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-30 22:08:02.166017", "step": 1595, "epoch": 3 }, { "type": "loss", "content": 0.0004471055290196091, "timestamp": "2025-09-30 22:08:02.194200", "step": 1596, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:08:02.235368", "step": 1596, "epoch": 3 }, { "type": "loss", "content": 0.00020820485951844603, "timestamp": "2025-09-30 22:08:02.241209", "step": 1597, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-30 22:08:02.279475", "step": 1597, "epoch": 3 }, { "type": "loss", "content": 0.0005493653588928282, "timestamp": "2025-09-30 22:08:02.289794", "step": 1598, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:08:02.328001", "step": 1598, "epoch": 3 }, { "type": "loss", "content": 0.00024865224258974195, "timestamp": "2025-09-30 22:08:02.333983", "step": 1599, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 224 ], "batch_size": 8, "flops": 4429610391296 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 64 ], "batch_size": 8, "flops": 1265603017216 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 256 ], "batch_size": 8, "flops": 5062411866112 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 5, 144 ], "batch_size": 8, "flops": 2847606704256 } ], "timestamp": "2025-09-30 22:08:03.013747", "step": 1599, "epoch": 3 }, { "type": "pplx", "content": 104521737.10689187, "timestamp": "2025-09-30 22:08:03.019853", "step": 1599, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:08:03.053604", "step": 1599, "epoch": 3 }, { "type": "loss", "content": 8.496097143506631e-05, "timestamp": "2025-09-30 22:08:03.079978", "step": 1600, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:08:03.113950", "step": 1600, "epoch": 3 }, { "type": "loss", "content": 0.0004024894442409277, "timestamp": "2025-09-30 22:08:03.120001", "step": 1601, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-30 22:08:03.160823", "step": 1601, "epoch": 3 }, { "type": "loss", "content": 0.002289453987032175, "timestamp": "2025-09-30 22:08:03.169481", "step": 1602, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:08:03.204493", "step": 1602, "epoch": 3 }, { "type": "loss", "content": 0.0002782022929750383, "timestamp": "2025-09-30 22:08:03.207404", "step": 1603, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-30 22:08:03.246178", "step": 1603, "epoch": 3 }, { "type": "loss", "content": 0.00017332390416413546, "timestamp": "2025-09-30 22:08:03.274045", "step": 1604, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:08:03.314144", "step": 1604, "epoch": 3 }, { "type": "loss", "content": 0.00021450161875691265, "timestamp": "2025-09-30 22:08:03.319232", "step": 1605, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 256 ], "flops": 7593922352128 }, "timestamp": "2025-09-30 22:08:03.359821", "step": 1605, "epoch": 3 }, { "type": "loss", "content": 3.722803739947267e-05, "timestamp": "2025-09-30 22:08:03.367781", "step": 1606, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:08:03.398632", "step": 1606, "epoch": 3 }, { "type": "loss", "content": 0.00020089758618269116, "timestamp": "2025-09-30 22:08:03.400602", "step": 1607, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:08:03.434721", "step": 1607, "epoch": 3 }, { "type": "loss", "content": 0.0003215324250049889, "timestamp": "2025-09-30 22:08:03.460118", "step": 1608, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:08:03.491394", "step": 1608, "epoch": 3 }, { "type": "loss", "content": 0.0012815693626180291, "timestamp": "2025-09-30 22:08:03.493533", "step": 1609, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:08:03.524683", "step": 1609, "epoch": 3 }, { "type": "loss", "content": 0.004723010119050741, "timestamp": "2025-09-30 22:08:03.526861", "step": 1610, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:08:03.557305", "step": 1610, "epoch": 3 }, { "type": "loss", "content": 0.00014322168135549873, "timestamp": "2025-09-30 22:08:03.559751", "step": 1611, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:08:03.592700", "step": 1611, "epoch": 3 }, { "type": "loss", "content": 9.33581031858921e-05, "timestamp": "2025-09-30 22:08:03.616537", "step": 1612, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:08:03.647387", "step": 1612, "epoch": 3 }, { "type": "loss", "content": 0.00022943881049286574, "timestamp": "2025-09-30 22:08:03.649671", "step": 1613, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:08:03.686610", "step": 1613, "epoch": 3 }, { "type": "loss", "content": 0.008927569724619389, "timestamp": "2025-09-30 22:08:03.691136", "step": 1614, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:08:03.730745", "step": 1614, "epoch": 3 }, { "type": "loss", "content": 0.00228672637604177, "timestamp": "2025-09-30 22:08:03.733198", "step": 1615, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:08:03.772058", "step": 1615, "epoch": 3 }, { "type": "loss", "content": 0.00068600446684286, "timestamp": "2025-09-30 22:08:03.797307", "step": 1616, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:08:03.830402", "step": 1616, "epoch": 3 }, { "type": "loss", "content": 0.002715339185670018, "timestamp": "2025-09-30 22:08:03.832684", "step": 1617, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:08:03.863999", "step": 1617, "epoch": 3 }, { "type": "loss", "content": 3.630607534432784e-05, "timestamp": "2025-09-30 22:08:03.866297", "step": 1618, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:08:03.897609", "step": 1618, "epoch": 3 }, { "type": "loss", "content": 0.0015939169097691774, "timestamp": "2025-09-30 22:08:03.899791", "step": 1619, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:08:03.933202", "step": 1619, "epoch": 3 }, { "type": "loss", "content": 3.691700840136036e-05, "timestamp": "2025-09-30 22:08:03.957020", "step": 1620, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:08:03.994494", "step": 1620, "epoch": 3 }, { "type": "loss", "content": 0.0012790290638804436, "timestamp": "2025-09-30 22:08:03.996772", "step": 1621, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-30 22:08:04.030003", "step": 1621, "epoch": 3 }, { "type": "loss", "content": 0.005167053546756506, "timestamp": "2025-09-30 22:08:04.037126", "step": 1622, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:08:04.075078", "step": 1622, "epoch": 3 }, { "type": "loss", "content": 0.00023754978610668331, "timestamp": "2025-09-30 22:08:04.079770", "step": 1623, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-30 22:08:04.114642", "step": 1623, "epoch": 3 }, { "type": "loss", "content": 0.0023776155430823565, "timestamp": "2025-09-30 22:08:04.143356", "step": 1624, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:08:04.178019", "step": 1624, "epoch": 3 }, { "type": "loss", "content": 0.020028727129101753, "timestamp": "2025-09-30 22:08:04.180324", "step": 1625, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:08:04.215444", "step": 1625, "epoch": 3 }, { "type": "loss", "content": 0.002531526843085885, "timestamp": "2025-09-30 22:08:04.217576", "step": 1626, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:08:04.257311", "step": 1626, "epoch": 3 }, { "type": "loss", "content": 0.0007390738464891911, "timestamp": "2025-09-30 22:08:04.259321", "step": 1627, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-30 22:08:04.295905", "step": 1627, "epoch": 3 }, { "type": "loss", "content": 0.0002480483672115952, "timestamp": "2025-09-30 22:08:04.323889", "step": 1628, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-30 22:08:04.357678", "step": 1628, "epoch": 3 }, { "type": "loss", "content": 0.00014147233741823584, "timestamp": "2025-09-30 22:08:04.362330", "step": 1629, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-30 22:08:04.395767", "step": 1629, "epoch": 3 }, { "type": "loss", "content": 0.00017867713177111, "timestamp": "2025-09-30 22:08:04.402891", "step": 1630, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:08:04.440235", "step": 1630, "epoch": 3 }, { "type": "loss", "content": 0.00012707387213595212, "timestamp": "2025-09-30 22:08:04.442387", "step": 1631, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-30 22:08:04.478325", "step": 1631, "epoch": 3 }, { "type": "loss", "content": 0.000509004108607769, "timestamp": "2025-09-30 22:08:04.506687", "step": 1632, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:08:04.543312", "step": 1632, "epoch": 3 }, { "type": "loss", "content": 6.3887688156683e-05, "timestamp": "2025-09-30 22:08:04.545290", "step": 1633, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:08:04.589706", "step": 1633, "epoch": 3 }, { "type": "loss", "content": 0.00019534204329829663, "timestamp": "2025-09-30 22:08:04.591941", "step": 1634, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:08:04.623935", "step": 1634, "epoch": 3 }, { "type": "loss", "content": 9.548116941004992e-05, "timestamp": "2025-09-30 22:08:04.626038", "step": 1635, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-30 22:08:04.674140", "step": 1635, "epoch": 3 }, { "type": "loss", "content": 0.002590916818007827, "timestamp": "2025-09-30 22:08:04.702182", "step": 1636, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:08:04.734293", "step": 1636, "epoch": 3 }, { "type": "loss", "content": 0.0026767959352582693, "timestamp": "2025-09-30 22:08:04.736317", "step": 1637, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:08:04.770294", "step": 1637, "epoch": 3 }, { "type": "loss", "content": 8.954613440437242e-05, "timestamp": "2025-09-30 22:08:04.772376", "step": 1638, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 224 ], "batch_size": 8, "flops": 4429610391296 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 64 ], "batch_size": 8, "flops": 1265603017216 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 256 ], "batch_size": 8, "flops": 5062411866112 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 5, 144 ], "batch_size": 8, "flops": 2847606704256 } ], "timestamp": "2025-09-30 22:08:05.452373", "step": 1638, "epoch": 3 }, { "type": "pplx", "content": 115538961.0174772, "timestamp": "2025-09-30 22:08:05.454369", "step": 1638, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-30 22:08:05.483522", "step": 1638, "epoch": 3 }, { "type": "loss", "content": 3.402471338631585e-05, "timestamp": "2025-09-30 22:08:05.490804", "step": 1639, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:08:05.525581", "step": 1639, "epoch": 3 }, { "type": "loss", "content": 0.004754878580570221, "timestamp": "2025-09-30 22:08:05.549459", "step": 1640, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:08:05.581943", "step": 1640, "epoch": 3 }, { "type": "loss", "content": 0.003890208899974823, "timestamp": "2025-09-30 22:08:05.583987", "step": 1641, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 272 ], "flops": 8068526078144 }, "timestamp": "2025-09-30 22:08:05.617330", "step": 1641, "epoch": 3 }, { "type": "loss", "content": 5.600903750746511e-05, "timestamp": "2025-09-30 22:08:05.627829", "step": 1642, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:08:05.659112", "step": 1642, "epoch": 3 }, { "type": "loss", "content": 0.0029056521598249674, "timestamp": "2025-09-30 22:08:05.661117", "step": 1643, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:08:05.692710", "step": 1643, "epoch": 3 }, { "type": "loss", "content": 0.0007899158517830074, "timestamp": "2025-09-30 22:08:05.717988", "step": 1644, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-30 22:08:05.763063", "step": 1644, "epoch": 3 }, { "type": "loss", "content": 0.0004726467013824731, "timestamp": "2025-09-30 22:08:05.768454", "step": 1645, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:08:05.803704", "step": 1645, "epoch": 3 }, { "type": "loss", "content": 0.00011156607797602192, "timestamp": "2025-09-30 22:08:05.805874", "step": 1646, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:08:05.843267", "step": 1646, "epoch": 3 }, { "type": "loss", "content": 0.0001994954509427771, "timestamp": "2025-09-30 22:08:05.845573", "step": 1647, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-30 22:08:05.880475", "step": 1647, "epoch": 3 }, { "type": "loss", "content": 0.0001336904097115621, "timestamp": "2025-09-30 22:08:05.909260", "step": 1648, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-30 22:08:05.940343", "step": 1648, "epoch": 3 }, { "type": "loss", "content": 0.00022927757527213544, "timestamp": "2025-09-30 22:08:05.945564", "step": 1649, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-30 22:08:05.981251", "step": 1649, "epoch": 3 }, { "type": "loss", "content": 5.889233580091968e-05, "timestamp": "2025-09-30 22:08:05.984116", "step": 1650, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 272 ], "flops": 8068526078144 }, "timestamp": "2025-09-30 22:08:06.016082", "step": 1650, "epoch": 3 }, { "type": "loss", "content": 6.961514009162784e-05, "timestamp": "2025-09-30 22:08:06.026407", "step": 1651, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:08:06.059199", "step": 1651, "epoch": 3 }, { "type": "loss", "content": 0.00011490475299069658, "timestamp": "2025-09-30 22:08:06.084648", "step": 1652, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-30 22:08:06.122253", "step": 1652, "epoch": 3 }, { "type": "loss", "content": 4.839776738663204e-05, "timestamp": "2025-09-30 22:08:06.127516", "step": 1653, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:08:06.160802", "step": 1653, "epoch": 3 }, { "type": "loss", "content": 2.3726159270154312e-05, "timestamp": "2025-09-30 22:08:06.163020", "step": 1654, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:08:06.205253", "step": 1654, "epoch": 3 }, { "type": "loss", "content": 0.00011560900748008862, "timestamp": "2025-09-30 22:08:06.208040", "step": 1655, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:08:06.239929", "step": 1655, "epoch": 3 }, { "type": "loss", "content": 7.772482058499008e-05, "timestamp": "2025-09-30 22:08:06.268057", "step": 1656, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:08:06.303065", "step": 1656, "epoch": 3 }, { "type": "loss", "content": 0.0005724158836528659, "timestamp": "2025-09-30 22:08:06.305384", "step": 1657, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:08:06.338520", "step": 1657, "epoch": 3 }, { "type": "loss", "content": 3.609975101426244e-05, "timestamp": "2025-09-30 22:08:06.340509", "step": 1658, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:08:06.375056", "step": 1658, "epoch": 3 }, { "type": "loss", "content": 0.00023306449293158948, "timestamp": "2025-09-30 22:08:06.378031", "step": 1659, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:08:06.409148", "step": 1659, "epoch": 3 }, { "type": "loss", "content": 0.00013319592108018696, "timestamp": "2025-09-30 22:08:06.435575", "step": 1660, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-30 22:08:06.467952", "step": 1660, "epoch": 3 }, { "type": "loss", "content": 0.0014770907582715154, "timestamp": "2025-09-30 22:08:06.472640", "step": 1661, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:08:06.506661", "step": 1661, "epoch": 3 }, { "type": "loss", "content": 8.019845699891448e-05, "timestamp": "2025-09-30 22:08:06.509503", "step": 1662, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-30 22:08:06.545897", "step": 1662, "epoch": 3 }, { "type": "loss", "content": 4.1435498133068904e-05, "timestamp": "2025-09-30 22:08:06.553635", "step": 1663, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:08:06.588194", "step": 1663, "epoch": 3 }, { "type": "loss", "content": 0.0002740359923336655, "timestamp": "2025-09-30 22:08:06.613532", "step": 1664, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 336 ], "flops": 9966940982208 }, "timestamp": "2025-09-30 22:08:06.647439", "step": 1664, "epoch": 3 }, { "type": "loss", "content": 6.516856956295669e-05, "timestamp": "2025-09-30 22:08:06.660143", "step": 1665, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:08:06.691015", "step": 1665, "epoch": 3 }, { "type": "loss", "content": 0.008054564706981182, "timestamp": "2025-09-30 22:08:06.693141", "step": 1666, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-30 22:08:06.724794", "step": 1666, "epoch": 3 }, { "type": "loss", "content": 5.055733709014021e-05, "timestamp": "2025-09-30 22:08:06.727106", "step": 1667, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:08:06.768751", "step": 1667, "epoch": 3 }, { "type": "loss", "content": 0.0001062467708834447, "timestamp": "2025-09-30 22:08:06.792145", "step": 1668, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-30 22:08:06.824186", "step": 1668, "epoch": 3 }, { "type": "loss", "content": 2.5943687433027662e-05, "timestamp": "2025-09-30 22:08:06.829545", "step": 1669, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:08:06.864663", "step": 1669, "epoch": 3 }, { "type": "loss", "content": 5.616279304376803e-05, "timestamp": "2025-09-30 22:08:06.867073", "step": 1670, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:08:06.900297", "step": 1670, "epoch": 3 }, { "type": "loss", "content": 4.885617090621963e-05, "timestamp": "2025-09-30 22:08:06.902554", "step": 1671, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:08:06.936246", "step": 1671, "epoch": 3 }, { "type": "loss", "content": 4.1574046917958185e-05, "timestamp": "2025-09-30 22:08:06.961348", "step": 1672, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:08:06.997844", "step": 1672, "epoch": 3 }, { "type": "loss", "content": 0.019031139090657234, "timestamp": "2025-09-30 22:08:07.000638", "step": 1673, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:08:07.042395", "step": 1673, "epoch": 3 }, { "type": "loss", "content": 0.00019428497762419283, "timestamp": "2025-09-30 22:08:07.045189", "step": 1674, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:08:07.076829", "step": 1674, "epoch": 3 }, { "type": "loss", "content": 0.0002930422779172659, "timestamp": "2025-09-30 22:08:07.079216", "step": 1675, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:08:07.112571", "step": 1675, "epoch": 3 }, { "type": "loss", "content": 0.0008218813454732299, "timestamp": "2025-09-30 22:08:07.137668", "step": 1676, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-30 22:08:07.171638", "step": 1676, "epoch": 3 }, { "type": "loss", "content": 2.3237113055074587e-05, "timestamp": "2025-09-30 22:08:07.177005", "step": 1677, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 224 ], "batch_size": 8, "flops": 4429610391296 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 64 ], "batch_size": 8, "flops": 1265603017216 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 256 ], "batch_size": 8, "flops": 5062411866112 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 5, 144 ], "batch_size": 8, "flops": 2847606704256 } ], "timestamp": "2025-09-30 22:08:07.816204", "step": 1677, "epoch": 3 }, { "type": "pplx", "content": 121808560.6157007, "timestamp": "2025-09-30 22:08:07.818747", "step": 1677, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:08:07.848201", "step": 1677, "epoch": 3 }, { "type": "loss", "content": 2.809312536555808e-05, "timestamp": "2025-09-30 22:08:07.850567", "step": 1678, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-30 22:08:07.882750", "step": 1678, "epoch": 3 }, { "type": "loss", "content": 0.00043884789920412004, "timestamp": "2025-09-30 22:08:07.889749", "step": 1679, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:08:07.922034", "step": 1679, "epoch": 3 }, { "type": "loss", "content": 0.002086315304040909, "timestamp": "2025-09-30 22:08:07.947140", "step": 1680, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:08:07.980525", "step": 1680, "epoch": 3 }, { "type": "loss", "content": 4.865376467932947e-05, "timestamp": "2025-09-30 22:08:07.983369", "step": 1681, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 256 ], "flops": 7593922352128 }, "timestamp": "2025-09-30 22:08:08.017152", "step": 1681, "epoch": 3 }, { "type": "loss", "content": 7.374538836302236e-05, "timestamp": "2025-09-30 22:08:08.024990", "step": 1682, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:08:08.056481", "step": 1682, "epoch": 3 }, { "type": "loss", "content": 6.192681757966056e-05, "timestamp": "2025-09-30 22:08:08.059948", "step": 1683, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-30 22:08:08.092914", "step": 1683, "epoch": 3 }, { "type": "loss", "content": 0.0006117112934589386, "timestamp": "2025-09-30 22:08:08.121736", "step": 1684, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:08:08.154336", "step": 1684, "epoch": 3 }, { "type": "loss", "content": 4.967437780578621e-05, "timestamp": "2025-09-30 22:08:08.157294", "step": 1685, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:08:08.191086", "step": 1685, "epoch": 3 }, { "type": "loss", "content": 3.295835995231755e-05, "timestamp": "2025-09-30 22:08:08.193954", "step": 1686, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-30 22:08:08.224463", "step": 1686, "epoch": 3 }, { "type": "loss", "content": 4.771046951645985e-05, "timestamp": "2025-09-30 22:08:08.231325", "step": 1687, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:08:08.265360", "step": 1687, "epoch": 3 }, { "type": "loss", "content": 4.849955075769685e-05, "timestamp": "2025-09-30 22:08:08.290450", "step": 1688, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-30 22:08:08.331749", "step": 1688, "epoch": 3 }, { "type": "loss", "content": 0.0001034624656313099, "timestamp": "2025-09-30 22:08:08.337217", "step": 1689, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:08:08.369308", "step": 1689, "epoch": 3 }, { "type": "loss", "content": 0.000108941356302239, "timestamp": "2025-09-30 22:08:08.371908", "step": 1690, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:08:08.405890", "step": 1690, "epoch": 3 }, { "type": "loss", "content": 4.94919549964834e-05, "timestamp": "2025-09-30 22:08:08.409277", "step": 1691, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:08:08.445119", "step": 1691, "epoch": 3 }, { "type": "loss", "content": 3.778712198254652e-05, "timestamp": "2025-09-30 22:08:08.469009", "step": 1692, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:08:08.500808", "step": 1692, "epoch": 3 }, { "type": "loss", "content": 7.626505248481408e-05, "timestamp": "2025-09-30 22:08:08.503411", "step": 1693, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-30 22:08:08.534715", "step": 1693, "epoch": 3 }, { "type": "loss", "content": 2.672936716408003e-05, "timestamp": "2025-09-30 22:08:08.537958", "step": 1694, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:08:08.570418", "step": 1694, "epoch": 3 }, { "type": "loss", "content": 0.005537017714232206, "timestamp": "2025-09-30 22:08:08.573775", "step": 1695, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:08:08.606232", "step": 1695, "epoch": 3 }, { "type": "loss", "content": 0.0002917372912634164, "timestamp": "2025-09-30 22:08:08.631346", "step": 1696, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:08:08.664470", "step": 1696, "epoch": 3 }, { "type": "loss", "content": 6.110074173193425e-05, "timestamp": "2025-09-30 22:08:08.667323", "step": 1697, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:08:08.699099", "step": 1697, "epoch": 3 }, { "type": "loss", "content": 0.00011303651990601793, "timestamp": "2025-09-30 22:08:08.701837", "step": 1698, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:08:08.732756", "step": 1698, "epoch": 3 }, { "type": "loss", "content": 0.0005966713069938123, "timestamp": "2025-09-30 22:08:08.737317", "step": 1699, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:08:08.769689", "step": 1699, "epoch": 3 }, { "type": "loss", "content": 6.444390601245686e-05, "timestamp": "2025-09-30 22:08:08.801952", "step": 1700, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:08:08.834298", "step": 1700, "epoch": 3 }, { "type": "loss", "content": 4.669316331273876e-05, "timestamp": "2025-09-30 22:08:08.836388", "step": 1701, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-30 22:08:08.869567", "step": 1701, "epoch": 3 }, { "type": "loss", "content": 0.0003925747296307236, "timestamp": "2025-09-30 22:08:08.876623", "step": 1702, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 272 ], "flops": 8068526078144 }, "timestamp": "2025-09-30 22:08:08.907036", "step": 1702, "epoch": 3 }, { "type": "loss", "content": 5.464601417770609e-05, "timestamp": "2025-09-30 22:08:08.917280", "step": 1703, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-30 22:08:08.949598", "step": 1703, "epoch": 3 }, { "type": "loss", "content": 0.0007480279309675097, "timestamp": "2025-09-30 22:08:08.977683", "step": 1704, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:08:09.011058", "step": 1704, "epoch": 3 }, { "type": "loss", "content": 0.0002451858308631927, "timestamp": "2025-09-30 22:08:09.013383", "step": 1705, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:08:09.046894", "step": 1705, "epoch": 3 }, { "type": "loss", "content": 0.01712205819785595, "timestamp": "2025-09-30 22:08:09.049384", "step": 1706, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:08:09.081727", "step": 1706, "epoch": 3 }, { "type": "loss", "content": 4.979677760275081e-05, "timestamp": "2025-09-30 22:08:09.084051", "step": 1707, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:08:09.114820", "step": 1707, "epoch": 3 }, { "type": "loss", "content": 0.00010694911907194182, "timestamp": "2025-09-30 22:08:09.138319", "step": 1708, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:08:09.170189", "step": 1708, "epoch": 3 }, { "type": "loss", "content": 8.407200948568061e-05, "timestamp": "2025-09-30 22:08:09.173668", "step": 1709, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:08:09.217356", "step": 1709, "epoch": 3 }, { "type": "loss", "content": 3.759035098482855e-05, "timestamp": "2025-09-30 22:08:09.219576", "step": 1710, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:08:09.251614", "step": 1710, "epoch": 3 }, { "type": "loss", "content": 6.0760878113796934e-05, "timestamp": "2025-09-30 22:08:09.256141", "step": 1711, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:08:09.288638", "step": 1711, "epoch": 3 }, { "type": "loss", "content": 0.0006046485505066812, "timestamp": "2025-09-30 22:08:09.312368", "step": 1712, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:08:09.343502", "step": 1712, "epoch": 3 }, { "type": "loss", "content": 4.117127173230983e-05, "timestamp": "2025-09-30 22:08:09.345604", "step": 1713, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:08:09.383976", "step": 1713, "epoch": 3 }, { "type": "loss", "content": 0.0038941497914493084, "timestamp": "2025-09-30 22:08:09.386523", "step": 1714, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:08:09.416535", "step": 1714, "epoch": 3 }, { "type": "loss", "content": 4.572251418721862e-05, "timestamp": "2025-09-30 22:08:09.419070", "step": 1715, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:08:09.450798", "step": 1715, "epoch": 3 }, { "type": "loss", "content": 4.344709304859862e-05, "timestamp": "2025-09-30 22:08:09.474705", "step": 1716, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 224 ], "batch_size": 8, "flops": 4429610391296 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 64 ], "batch_size": 8, "flops": 1265603017216 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 256 ], "batch_size": 8, "flops": 5062411866112 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 5, 144 ], "batch_size": 8, "flops": 2847606704256 } ], "timestamp": "2025-09-30 22:08:10.121434", "step": 1716, "epoch": 3 }, { "type": "pplx", "content": 123022155.31750192, "timestamp": "2025-09-30 22:08:10.123279", "step": 1716, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:08:10.152016", "step": 1716, "epoch": 3 }, { "type": "loss", "content": 2.7246298486716114e-05, "timestamp": "2025-09-30 22:08:10.154210", "step": 1717, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:08:10.187641", "step": 1717, "epoch": 3 }, { "type": "loss", "content": 2.945954656752292e-05, "timestamp": "2025-09-30 22:08:10.192406", "step": 1718, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-30 22:08:10.230016", "step": 1718, "epoch": 3 }, { "type": "loss", "content": 0.0006682882667519152, "timestamp": "2025-09-30 22:08:10.237555", "step": 1719, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:08:10.274085", "step": 1719, "epoch": 3 }, { "type": "loss", "content": 0.00010243523138342425, "timestamp": "2025-09-30 22:08:10.298031", "step": 1720, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:08:10.332529", "step": 1720, "epoch": 3 }, { "type": "loss", "content": 3.174094672431238e-05, "timestamp": "2025-09-30 22:08:10.334866", "step": 1721, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-30 22:08:10.369414", "step": 1721, "epoch": 3 }, { "type": "loss", "content": 6.566093361470848e-05, "timestamp": "2025-09-30 22:08:10.376545", "step": 1722, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:08:10.420688", "step": 1722, "epoch": 3 }, { "type": "loss", "content": 0.00014871249732095748, "timestamp": "2025-09-30 22:08:10.423199", "step": 1723, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:08:10.453615", "step": 1723, "epoch": 3 }, { "type": "loss", "content": 0.0001852501736721024, "timestamp": "2025-09-30 22:08:10.477119", "step": 1724, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:08:10.507896", "step": 1724, "epoch": 3 }, { "type": "loss", "content": 0.005387753248214722, "timestamp": "2025-09-30 22:08:10.513194", "step": 1725, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:08:10.548504", "step": 1725, "epoch": 3 }, { "type": "loss", "content": 0.00018650003767106682, "timestamp": "2025-09-30 22:08:10.550888", "step": 1726, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:08:10.586529", "step": 1726, "epoch": 3 }, { "type": "loss", "content": 2.3841199435992166e-05, "timestamp": "2025-09-30 22:08:10.589189", "step": 1727, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:08:10.620099", "step": 1727, "epoch": 3 }, { "type": "loss", "content": 3.871664375765249e-05, "timestamp": "2025-09-30 22:08:10.645377", "step": 1728, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:08:10.677597", "step": 1728, "epoch": 3 }, { "type": "loss", "content": 0.0007445144583471119, "timestamp": "2025-09-30 22:08:10.679797", "step": 1729, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:08:10.711443", "step": 1729, "epoch": 3 }, { "type": "loss", "content": 3.987590753240511e-05, "timestamp": "2025-09-30 22:08:10.713708", "step": 1730, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 272 ], "flops": 8068526078144 }, "timestamp": "2025-09-30 22:08:10.744341", "step": 1730, "epoch": 3 }, { "type": "loss", "content": 0.0006975015858188272, "timestamp": "2025-09-30 22:08:10.754725", "step": 1731, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:08:10.785891", "step": 1731, "epoch": 3 }, { "type": "loss", "content": 3.186930189258419e-05, "timestamp": "2025-09-30 22:08:10.809266", "step": 1732, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:08:10.839143", "step": 1732, "epoch": 3 }, { "type": "loss", "content": 2.39141645579366e-05, "timestamp": "2025-09-30 22:08:10.841347", "step": 1733, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:08:10.872641", "step": 1733, "epoch": 3 }, { "type": "loss", "content": 0.0007975990301929414, "timestamp": "2025-09-30 22:08:10.878155", "step": 1734, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-30 22:08:10.909577", "step": 1734, "epoch": 3 }, { "type": "loss", "content": 3.0157307264744304e-05, "timestamp": "2025-09-30 22:08:10.911579", "step": 1735, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:08:10.943570", "step": 1735, "epoch": 3 }, { "type": "loss", "content": 0.00013178681547287852, "timestamp": "2025-09-30 22:08:10.968656", "step": 1736, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:08:11.002059", "step": 1736, "epoch": 3 }, { "type": "loss", "content": 0.000744854100048542, "timestamp": "2025-09-30 22:08:11.004778", "step": 1737, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:08:11.036810", "step": 1737, "epoch": 3 }, { "type": "loss", "content": 7.578790973639116e-05, "timestamp": "2025-09-30 22:08:11.039780", "step": 1738, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 256 ], "flops": 7593922352128 }, "timestamp": "2025-09-30 22:08:11.077610", "step": 1738, "epoch": 3 }, { "type": "loss", "content": 4.3506599467946216e-05, "timestamp": "2025-09-30 22:08:11.085666", "step": 1739, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:08:11.117434", "step": 1739, "epoch": 3 }, { "type": "loss", "content": 6.423755257856101e-05, "timestamp": "2025-09-30 22:08:11.142553", "step": 1740, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:08:11.176165", "step": 1740, "epoch": 3 }, { "type": "loss", "content": 0.00023928364680614322, "timestamp": "2025-09-30 22:08:11.178379", "step": 1741, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:08:11.210418", "step": 1741, "epoch": 3 }, { "type": "loss", "content": 6.30137074040249e-05, "timestamp": "2025-09-30 22:08:11.214998", "step": 1742, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:08:11.247480", "step": 1742, "epoch": 3 }, { "type": "loss", "content": 0.0006644020904786885, "timestamp": "2025-09-30 22:08:11.250348", "step": 1743, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 256 ], "flops": 7593922352128 }, "timestamp": "2025-09-30 22:08:11.282008", "step": 1743, "epoch": 3 }, { "type": "loss", "content": 8.710688416613266e-05, "timestamp": "2025-09-30 22:08:11.311018", "step": 1744, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:08:11.341815", "step": 1744, "epoch": 3 }, { "type": "loss", "content": 0.0001336832356173545, "timestamp": "2025-09-30 22:08:11.344275", "step": 1745, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-30 22:08:11.375695", "step": 1745, "epoch": 3 }, { "type": "loss", "content": 0.006218440365046263, "timestamp": "2025-09-30 22:08:11.382933", "step": 1746, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:08:11.413325", "step": 1746, "epoch": 3 }, { "type": "loss", "content": 0.002578115090727806, "timestamp": "2025-09-30 22:08:11.415211", "step": 1747, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:08:11.447475", "step": 1747, "epoch": 3 }, { "type": "loss", "content": 0.0001024999437504448, "timestamp": "2025-09-30 22:08:11.471344", "step": 1748, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 304 ], "flops": 9017733530176 }, "timestamp": "2025-09-30 22:08:11.502763", "step": 1748, "epoch": 3 }, { "type": "loss", "content": 6.375632074195892e-05, "timestamp": "2025-09-30 22:08:11.512589", "step": 1749, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:08:11.543597", "step": 1749, "epoch": 3 }, { "type": "loss", "content": 3.2842282962519675e-05, "timestamp": "2025-09-30 22:08:11.547960", "step": 1750, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:08:11.580397", "step": 1750, "epoch": 3 }, { "type": "loss", "content": 0.0002622344472911209, "timestamp": "2025-09-30 22:08:11.584669", "step": 1751, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:08:11.615644", "step": 1751, "epoch": 3 }, { "type": "loss", "content": 5.3559277148451656e-05, "timestamp": "2025-09-30 22:08:11.641124", "step": 1752, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-30 22:08:11.678427", "step": 1752, "epoch": 3 }, { "type": "loss", "content": 0.0020374690648168325, "timestamp": "2025-09-30 22:08:11.683692", "step": 1753, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:08:11.714588", "step": 1753, "epoch": 3 }, { "type": "loss", "content": 2.5645465939305723e-05, "timestamp": "2025-09-30 22:08:11.716983", "step": 1754, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:08:11.747409", "step": 1754, "epoch": 3 }, { "type": "loss", "content": 0.0005786855472251773, "timestamp": "2025-09-30 22:08:11.749624", "step": 1755, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 224 ], "batch_size": 8, "flops": 4429610391296 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 64 ], "batch_size": 8, "flops": 1265603017216 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 256 ], "batch_size": 8, "flops": 5062411866112 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 5, 144 ], "batch_size": 8, "flops": 2847606704256 } ], "timestamp": "2025-09-30 22:08:12.585912", "step": 1755, "epoch": 3 }, { "type": "pplx", "content": 123261309.76453374, "timestamp": "2025-09-30 22:08:12.587845", "step": 1755, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-30 22:08:12.617583", "step": 1755, "epoch": 3 }, { "type": "loss", "content": 3.3810920285759494e-05, "timestamp": "2025-09-30 22:08:12.646149", "step": 1756, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:08:12.685768", "step": 1756, "epoch": 3 }, { "type": "loss", "content": 0.0003127233940176666, "timestamp": "2025-09-30 22:08:12.688016", "step": 1757, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:08:12.720260", "step": 1757, "epoch": 3 }, { "type": "loss", "content": 0.00022420093591790646, "timestamp": "2025-09-30 22:08:12.724993", "step": 1758, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 256 ], "flops": 7593922352128 }, "timestamp": "2025-09-30 22:08:12.757111", "step": 1758, "epoch": 3 }, { "type": "loss", "content": 0.00010123888932866976, "timestamp": "2025-09-30 22:08:12.765058", "step": 1759, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:08:12.800033", "step": 1759, "epoch": 3 }, { "type": "loss", "content": 7.35086141503416e-05, "timestamp": "2025-09-30 22:08:12.825405", "step": 1760, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-30 22:08:12.865360", "step": 1760, "epoch": 3 }, { "type": "loss", "content": 3.3173182600876316e-05, "timestamp": "2025-09-30 22:08:12.870461", "step": 1761, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 80 ], "flops": 2373281365952 }, "timestamp": "2025-09-30 22:08:12.904167", "step": 1761, "epoch": 3 }, { "type": "loss", "content": 4.140309101785533e-05, "timestamp": "2025-09-30 22:08:12.906124", "step": 1762, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-30 22:08:12.941723", "step": 1762, "epoch": 3 }, { "type": "loss", "content": 0.00019971518486272544, "timestamp": "2025-09-30 22:08:12.943757", "step": 1763, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-30 22:08:12.982711", "step": 1763, "epoch": 3 }, { "type": "loss", "content": 0.0005979278357699513, "timestamp": "2025-09-30 22:08:13.010824", "step": 1764, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:08:13.043654", "step": 1764, "epoch": 3 }, { "type": "loss", "content": 2.990214852616191e-05, "timestamp": "2025-09-30 22:08:13.045882", "step": 1765, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:08:13.077493", "step": 1765, "epoch": 3 }, { "type": "loss", "content": 0.0001495322067057714, "timestamp": "2025-09-30 22:08:13.079602", "step": 1766, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:08:13.119342", "step": 1766, "epoch": 3 }, { "type": "loss", "content": 0.0001800880127120763, "timestamp": "2025-09-30 22:08:13.122330", "step": 1767, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:08:13.174447", "step": 1767, "epoch": 3 }, { "type": "loss", "content": 0.00013568572467193007, "timestamp": "2025-09-30 22:08:13.197813", "step": 1768, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:08:13.236825", "step": 1768, "epoch": 3 }, { "type": "loss", "content": 0.0002641365572344512, "timestamp": "2025-09-30 22:08:13.239161", "step": 1769, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-30 22:08:13.277883", "step": 1769, "epoch": 3 }, { "type": "loss", "content": 0.018095744773745537, "timestamp": "2025-09-30 22:08:13.284886", "step": 1770, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:08:13.315789", "step": 1770, "epoch": 3 }, { "type": "loss", "content": 3.474125333013944e-05, "timestamp": "2025-09-30 22:08:13.317887", "step": 1771, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:08:13.350296", "step": 1771, "epoch": 3 }, { "type": "loss", "content": 5.6115230108844116e-05, "timestamp": "2025-09-30 22:08:13.375844", "step": 1772, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-30 22:08:13.420055", "step": 1772, "epoch": 3 }, { "type": "loss", "content": 3.4445827623130754e-05, "timestamp": "2025-09-30 22:08:13.425114", "step": 1773, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-30 22:08:13.459661", "step": 1773, "epoch": 3 }, { "type": "loss", "content": 7.985342381289229e-05, "timestamp": "2025-09-30 22:08:13.466773", "step": 1774, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:08:13.499306", "step": 1774, "epoch": 3 }, { "type": "loss", "content": 6.720200326526538e-05, "timestamp": "2025-09-30 22:08:13.501629", "step": 1775, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:08:13.539229", "step": 1775, "epoch": 3 }, { "type": "loss", "content": 3.170343188685365e-05, "timestamp": "2025-09-30 22:08:13.562723", "step": 1776, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:08:13.594808", "step": 1776, "epoch": 3 }, { "type": "loss", "content": 3.736267171916552e-05, "timestamp": "2025-09-30 22:08:13.597121", "step": 1777, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:08:13.638168", "step": 1777, "epoch": 3 }, { "type": "loss", "content": 3.414504317333922e-05, "timestamp": "2025-09-30 22:08:13.640483", "step": 1778, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:08:13.676919", "step": 1778, "epoch": 3 }, { "type": "loss", "content": 0.0002532243961468339, "timestamp": "2025-09-30 22:08:13.679667", "step": 1779, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:08:13.712968", "step": 1779, "epoch": 3 }, { "type": "loss", "content": 2.2894188077771105e-05, "timestamp": "2025-09-30 22:08:13.738132", "step": 1780, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:08:13.777012", "step": 1780, "epoch": 3 }, { "type": "loss", "content": 0.00030392984626814723, "timestamp": "2025-09-30 22:08:13.779101", "step": 1781, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:08:13.821398", "step": 1781, "epoch": 3 }, { "type": "loss", "content": 4.223351788823493e-05, "timestamp": "2025-09-30 22:08:13.825786", "step": 1782, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:08:13.865488", "step": 1782, "epoch": 3 }, { "type": "loss", "content": 1.772106406860985e-05, "timestamp": "2025-09-30 22:08:13.869031", "step": 1783, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:08:13.914524", "step": 1783, "epoch": 3 }, { "type": "loss", "content": 9.17787037906237e-05, "timestamp": "2025-09-30 22:08:13.939747", "step": 1784, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-30 22:08:13.979286", "step": 1784, "epoch": 3 }, { "type": "loss", "content": 0.00010865591320907697, "timestamp": "2025-09-30 22:08:13.983792", "step": 1785, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:08:14.021728", "step": 1785, "epoch": 3 }, { "type": "loss", "content": 3.610273779486306e-05, "timestamp": "2025-09-30 22:08:14.026121", "step": 1786, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-30 22:08:14.057167", "step": 1786, "epoch": 3 }, { "type": "loss", "content": 0.0003818259574472904, "timestamp": "2025-09-30 22:08:14.064336", "step": 1787, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:08:14.096565", "step": 1787, "epoch": 3 }, { "type": "loss", "content": 0.000534689286723733, "timestamp": "2025-09-30 22:08:14.120234", "step": 1788, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:08:14.161199", "step": 1788, "epoch": 3 }, { "type": "loss", "content": 0.0001270804350497201, "timestamp": "2025-09-30 22:08:14.163416", "step": 1789, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-30 22:08:14.201203", "step": 1789, "epoch": 3 }, { "type": "loss", "content": 0.00016834995767567307, "timestamp": "2025-09-30 22:08:14.204065", "step": 1790, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:08:14.252746", "step": 1790, "epoch": 3 }, { "type": "loss", "content": 3.9347731217276305e-05, "timestamp": "2025-09-30 22:08:14.254923", "step": 1791, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:08:14.293383", "step": 1791, "epoch": 3 }, { "type": "loss", "content": 3.699236913234927e-05, "timestamp": "2025-09-30 22:08:14.318959", "step": 1792, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-30 22:08:14.350695", "step": 1792, "epoch": 3 }, { "type": "loss", "content": 0.04252466931939125, "timestamp": "2025-09-30 22:08:14.355466", "step": 1793, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:08:14.389348", "step": 1793, "epoch": 3 }, { "type": "loss", "content": 0.003464995650574565, "timestamp": "2025-09-30 22:08:14.391366", "step": 1794, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 224 ], "batch_size": 8, "flops": 4429610391296 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 64 ], "batch_size": 8, "flops": 1265603017216 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 256 ], "batch_size": 8, "flops": 5062411866112 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 5, 144 ], "batch_size": 8, "flops": 2847606704256 } ], "timestamp": "2025-09-30 22:08:15.196274", "step": 1794, "epoch": 3 }, { "type": "pplx", "content": 120951084.46872666, "timestamp": "2025-09-30 22:08:15.201859", "step": 1794, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:08:15.252216", "step": 1794, "epoch": 3 }, { "type": "loss", "content": 2.1755853595095687e-05, "timestamp": "2025-09-30 22:08:15.257417", "step": 1795, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:08:15.302369", "step": 1795, "epoch": 3 }, { "type": "loss", "content": 5.0607039156602696e-05, "timestamp": "2025-09-30 22:08:15.340142", "step": 1796, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:08:15.389955", "step": 1796, "epoch": 3 }, { "type": "loss", "content": 0.00014252289838623255, "timestamp": "2025-09-30 22:08:15.394160", "step": 1797, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-30 22:08:15.428341", "step": 1797, "epoch": 3 }, { "type": "loss", "content": 4.311428710934706e-05, "timestamp": "2025-09-30 22:08:15.443274", "step": 1798, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:08:15.490139", "step": 1798, "epoch": 3 }, { "type": "loss", "content": 4.962775710737333e-05, "timestamp": "2025-09-30 22:08:15.504957", "step": 1799, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:08:15.541027", "step": 1799, "epoch": 3 }, { "type": "loss", "content": 7.520474173361436e-05, "timestamp": "2025-09-30 22:08:15.576857", "step": 1800, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-30 22:08:15.624365", "step": 1800, "epoch": 3 }, { "type": "loss", "content": 5.5460110161220655e-05, "timestamp": "2025-09-30 22:08:15.627503", "step": 1801, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:08:15.661828", "step": 1801, "epoch": 3 }, { "type": "loss", "content": 5.6858716561691836e-05, "timestamp": "2025-09-30 22:08:15.666471", "step": 1802, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:08:15.700968", "step": 1802, "epoch": 3 }, { "type": "loss", "content": 0.0005492048221640289, "timestamp": "2025-09-30 22:08:15.705724", "step": 1803, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:08:15.741314", "step": 1803, "epoch": 3 }, { "type": "loss", "content": 0.00012128066009609029, "timestamp": "2025-09-30 22:08:15.767648", "step": 1804, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:08:15.813858", "step": 1804, "epoch": 3 }, { "type": "loss", "content": 0.005712445825338364, "timestamp": "2025-09-30 22:08:15.816537", "step": 1805, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:08:15.849954", "step": 1805, "epoch": 3 }, { "type": "loss", "content": 0.0031685256399214268, "timestamp": "2025-09-30 22:08:15.854239", "step": 1806, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-30 22:08:15.887363", "step": 1806, "epoch": 3 }, { "type": "loss", "content": 2.441569813527167e-05, "timestamp": "2025-09-30 22:08:15.894288", "step": 1807, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-30 22:08:15.929882", "step": 1807, "epoch": 3 }, { "type": "loss", "content": 0.018170934170484543, "timestamp": "2025-09-30 22:08:15.957911", "step": 1808, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:08:15.998828", "step": 1808, "epoch": 3 }, { "type": "loss", "content": 2.65174348896835e-05, "timestamp": "2025-09-30 22:08:16.004518", "step": 1809, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:08:16.039339", "step": 1809, "epoch": 3 }, { "type": "loss", "content": 0.00011656737478915602, "timestamp": "2025-09-30 22:08:16.043396", "step": 1810, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-30 22:08:16.090084", "step": 1810, "epoch": 3 }, { "type": "loss", "content": 9.255034092348069e-05, "timestamp": "2025-09-30 22:08:16.097167", "step": 1811, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:08:16.130713", "step": 1811, "epoch": 3 }, { "type": "loss", "content": 7.447423558915034e-05, "timestamp": "2025-09-30 22:08:16.154971", "step": 1812, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:08:16.202653", "step": 1812, "epoch": 3 }, { "type": "loss", "content": 0.00020663348550442606, "timestamp": "2025-09-30 22:08:16.209417", "step": 1813, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-30 22:08:16.262784", "step": 1813, "epoch": 3 }, { "type": "loss", "content": 4.137990254093893e-05, "timestamp": "2025-09-30 22:08:16.269464", "step": 1814, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-30 22:08:16.315374", "step": 1814, "epoch": 3 }, { "type": "loss", "content": 0.0001488216657890007, "timestamp": "2025-09-30 22:08:16.317890", "step": 1815, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:08:16.356081", "step": 1815, "epoch": 3 }, { "type": "loss", "content": 9.109562961384654e-05, "timestamp": "2025-09-30 22:08:16.381173", "step": 1816, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-30 22:08:16.421995", "step": 1816, "epoch": 3 }, { "type": "loss", "content": 8.352736767847091e-05, "timestamp": "2025-09-30 22:08:16.426263", "step": 1817, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:08:16.466707", "step": 1817, "epoch": 3 }, { "type": "loss", "content": 2.977161784656346e-05, "timestamp": "2025-09-30 22:08:16.470890", "step": 1818, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:08:16.505994", "step": 1818, "epoch": 3 }, { "type": "loss", "content": 0.0001259616547031328, "timestamp": "2025-09-30 22:08:16.508545", "step": 1819, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-30 22:08:16.542695", "step": 1819, "epoch": 3 }, { "type": "loss", "content": 7.263097359100357e-05, "timestamp": "2025-09-30 22:08:16.570206", "step": 1820, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-30 22:08:16.608132", "step": 1820, "epoch": 3 }, { "type": "loss", "content": 8.77750717336312e-05, "timestamp": "2025-09-30 22:08:16.612891", "step": 1821, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:08:16.649859", "step": 1821, "epoch": 3 }, { "type": "loss", "content": 4.603018533089198e-05, "timestamp": "2025-09-30 22:08:16.652882", "step": 1822, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:08:16.688440", "step": 1822, "epoch": 3 }, { "type": "loss", "content": 0.0004101696249563247, "timestamp": "2025-09-30 22:08:16.691465", "step": 1823, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:08:16.726023", "step": 1823, "epoch": 3 }, { "type": "loss", "content": 8.775664173299447e-05, "timestamp": "2025-09-30 22:08:16.750840", "step": 1824, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-30 22:08:16.784702", "step": 1824, "epoch": 3 }, { "type": "loss", "content": 0.0006790863699279726, "timestamp": "2025-09-30 22:08:16.789397", "step": 1825, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:08:16.822832", "step": 1825, "epoch": 3 }, { "type": "loss", "content": 0.0005246041109785438, "timestamp": "2025-09-30 22:08:16.827178", "step": 1826, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:08:16.861392", "step": 1826, "epoch": 3 }, { "type": "loss", "content": 0.00011121475836262107, "timestamp": "2025-09-30 22:08:16.866013", "step": 1827, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-30 22:08:16.899442", "step": 1827, "epoch": 3 }, { "type": "loss", "content": 3.740801548701711e-05, "timestamp": "2025-09-30 22:08:16.928008", "step": 1828, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:08:16.963599", "step": 1828, "epoch": 3 }, { "type": "loss", "content": 0.00015850462659727782, "timestamp": "2025-09-30 22:08:16.966796", "step": 1829, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:08:17.005488", "step": 1829, "epoch": 3 }, { "type": "loss", "content": 4.086808621650562e-05, "timestamp": "2025-09-30 22:08:17.009996", "step": 1830, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:08:17.043938", "step": 1830, "epoch": 3 }, { "type": "loss", "content": 0.011201680637896061, "timestamp": "2025-09-30 22:08:17.046266", "step": 1831, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-30 22:08:17.082089", "step": 1831, "epoch": 3 }, { "type": "loss", "content": 8.120276470435783e-05, "timestamp": "2025-09-30 22:08:17.109518", "step": 1832, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:08:17.142928", "step": 1832, "epoch": 3 }, { "type": "loss", "content": 0.031549014151096344, "timestamp": "2025-09-30 22:08:17.146834", "step": 1833, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 224 ], "batch_size": 8, "flops": 4429610391296 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 64 ], "batch_size": 8, "flops": 1265603017216 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 256 ], "batch_size": 8, "flops": 5062411866112 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 5, 144 ], "batch_size": 8, "flops": 2847606704256 } ], "timestamp": "2025-09-30 22:08:17.894878", "step": 1833, "epoch": 3 }, { "type": "pplx", "content": 129112742.85206084, "timestamp": "2025-09-30 22:08:17.897575", "step": 1833, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:08:17.927259", "step": 1833, "epoch": 3 }, { "type": "loss", "content": 0.001220227568410337, "timestamp": "2025-09-30 22:08:17.929394", "step": 1834, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:08:17.971377", "step": 1834, "epoch": 3 }, { "type": "loss", "content": 0.00038733595283702016, "timestamp": "2025-09-30 22:08:17.975599", "step": 1835, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:08:18.010328", "step": 1835, "epoch": 3 }, { "type": "loss", "content": 7.79922484070994e-05, "timestamp": "2025-09-30 22:08:18.034170", "step": 1836, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:08:18.072204", "step": 1836, "epoch": 3 }, { "type": "loss", "content": 0.0011681662872433662, "timestamp": "2025-09-30 22:08:18.074439", "step": 1837, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:08:18.121636", "step": 1837, "epoch": 3 }, { "type": "loss", "content": 2.5843259209068492e-05, "timestamp": "2025-09-30 22:08:18.124455", "step": 1838, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:08:18.156820", "step": 1838, "epoch": 3 }, { "type": "loss", "content": 7.065497629810125e-05, "timestamp": "2025-09-30 22:08:18.159914", "step": 1839, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-30 22:08:18.198259", "step": 1839, "epoch": 3 }, { "type": "loss", "content": 3.946414290112443e-05, "timestamp": "2025-09-30 22:08:18.222454", "step": 1840, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:08:18.262948", "step": 1840, "epoch": 3 }, { "type": "loss", "content": 0.00021928052592556924, "timestamp": "2025-09-30 22:08:18.265079", "step": 1841, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:08:18.297876", "step": 1841, "epoch": 3 }, { "type": "loss", "content": 0.00023360762861557305, "timestamp": "2025-09-30 22:08:18.299793", "step": 1842, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:08:18.332105", "step": 1842, "epoch": 3 }, { "type": "loss", "content": 8.189439540728927e-05, "timestamp": "2025-09-30 22:08:18.334603", "step": 1843, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-30 22:08:18.373101", "step": 1843, "epoch": 3 }, { "type": "loss", "content": 0.00012044947652611881, "timestamp": "2025-09-30 22:08:18.396643", "step": 1844, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:08:18.437210", "step": 1844, "epoch": 3 }, { "type": "loss", "content": 0.00012855039676651359, "timestamp": "2025-09-30 22:08:18.439438", "step": 1845, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 256 ], "flops": 7593922352128 }, "timestamp": "2025-09-30 22:08:18.476909", "step": 1845, "epoch": 3 }, { "type": "loss", "content": 6.362907879520208e-05, "timestamp": "2025-09-30 22:08:18.485087", "step": 1846, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:08:18.517382", "step": 1846, "epoch": 3 }, { "type": "loss", "content": 0.00016817821597214788, "timestamp": "2025-09-30 22:08:18.519888", "step": 1847, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-30 22:08:18.558738", "step": 1847, "epoch": 3 }, { "type": "loss", "content": 0.0010157003998756409, "timestamp": "2025-09-30 22:08:18.586910", "step": 1848, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:08:18.620972", "step": 1848, "epoch": 3 }, { "type": "loss", "content": 6.409407797036693e-05, "timestamp": "2025-09-30 22:08:18.624429", "step": 1849, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:08:18.658250", "step": 1849, "epoch": 3 }, { "type": "loss", "content": 0.00013632301124744117, "timestamp": "2025-09-30 22:08:18.662956", "step": 1850, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:08:18.695886", "step": 1850, "epoch": 3 }, { "type": "loss", "content": 0.01023082248866558, "timestamp": "2025-09-30 22:08:18.699569", "step": 1851, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:08:18.733099", "step": 1851, "epoch": 3 }, { "type": "loss", "content": 0.008190372958779335, "timestamp": "2025-09-30 22:08:18.757608", "step": 1852, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:08:18.790274", "step": 1852, "epoch": 3 }, { "type": "loss", "content": 0.00043314872891642153, "timestamp": "2025-09-30 22:08:18.796711", "step": 1853, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:08:18.828987", "step": 1853, "epoch": 3 }, { "type": "loss", "content": 5.934419823461212e-05, "timestamp": "2025-09-30 22:08:18.832296", "step": 1854, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:08:18.871379", "step": 1854, "epoch": 3 }, { "type": "loss", "content": 0.0002732239954639226, "timestamp": "2025-09-30 22:08:18.875970", "step": 1855, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:08:18.910790", "step": 1855, "epoch": 3 }, { "type": "loss", "content": 4.7785659262444824e-05, "timestamp": "2025-09-30 22:08:18.935233", "step": 1856, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:08:18.973620", "step": 1856, "epoch": 3 }, { "type": "loss", "content": 0.0002578292042016983, "timestamp": "2025-09-30 22:08:18.975975", "step": 1857, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-30 22:08:19.016347", "step": 1857, "epoch": 3 }, { "type": "loss", "content": 0.00011785969400079921, "timestamp": "2025-09-30 22:08:19.023376", "step": 1858, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-30 22:08:19.056914", "step": 1858, "epoch": 3 }, { "type": "loss", "content": 2.9439393983921036e-05, "timestamp": "2025-09-30 22:08:19.063975", "step": 1859, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-30 22:08:19.096222", "step": 1859, "epoch": 3 }, { "type": "loss", "content": 0.0003162787761539221, "timestamp": "2025-09-30 22:08:19.119985", "step": 1860, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-30 22:08:19.154241", "step": 1860, "epoch": 3 }, { "type": "loss", "content": 0.00015654593880753964, "timestamp": "2025-09-30 22:08:19.159920", "step": 1861, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:08:19.192963", "step": 1861, "epoch": 3 }, { "type": "loss", "content": 0.0001386739022564143, "timestamp": "2025-09-30 22:08:19.197589", "step": 1862, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-30 22:08:19.231116", "step": 1862, "epoch": 3 }, { "type": "loss", "content": 0.00011772316793212667, "timestamp": "2025-09-30 22:08:19.243666", "step": 1863, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:08:19.290090", "step": 1863, "epoch": 3 }, { "type": "loss", "content": 0.04032059386372566, "timestamp": "2025-09-30 22:08:19.316983", "step": 1864, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-30 22:08:19.358614", "step": 1864, "epoch": 3 }, { "type": "loss", "content": 0.07269955426454544, "timestamp": "2025-09-30 22:08:19.364288", "step": 1865, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:08:19.406194", "step": 1865, "epoch": 3 }, { "type": "loss", "content": 0.00010652485798345879, "timestamp": "2025-09-30 22:08:19.411477", "step": 1866, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:08:19.451042", "step": 1866, "epoch": 3 }, { "type": "loss", "content": 6.099715028540231e-05, "timestamp": "2025-09-30 22:08:19.457453", "step": 1867, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-30 22:08:19.491157", "step": 1867, "epoch": 3 }, { "type": "loss", "content": 6.526331708300859e-05, "timestamp": "2025-09-30 22:08:19.518987", "step": 1868, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 2, 192 ], "flops": 2847885110400 }, "timestamp": "2025-09-30 22:08:19.558052", "step": 1868, "epoch": 3 }, { "type": "loss", "content": 4.0736955270403996e-05, "timestamp": "2025-09-30 22:08:19.561941", "step": 1869, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 224 ], "batch_size": 8, "flops": 4429610391296 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 64 ], "batch_size": 8, "flops": 1265603017216 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 256 ], "batch_size": 8, "flops": 5062411866112 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 5, 144 ], "batch_size": 8, "flops": 2847606704256 } ], "timestamp": "2025-09-30 22:08:20.317601", "step": 1869, "epoch": 3 }, { "type": "pplx", "content": 114416360.51964663, "timestamp": "2025-09-30 22:08:20.323449", "step": 1869, "epoch": 3 }, { "type": "best_pplx", "content": 50567875.175425716, "timestamp": "2025-09-30 22:08:20.327822", "step": 1869, "epoch": 3 }, { "type": "best_step", "content": 39, "timestamp": "2025-09-30 22:08:20.331888", "step": 1869, "epoch": 3 }, { "type": "total_pplx_flops", "content": 5333250945655808, "timestamp": "2025-09-30 22:08:20.338020", "step": 1869, "epoch": 3 }, { "type": "total_train_flops", "content": 9219668431260864, "timestamp": "2025-09-30 22:08:20.343552", "step": 1869, "epoch": 3 } ], "best_evals": { "pplx": { "score": 50567875.175425716, "step": 39 }, "rougel": { "precision": 0.8158844765342961, "recall": 0.8158844765342961, "fmeasure": 0.8158844765342961 } } }