{ "training_args": { "output_dir": "/sc/projects/sci-herbrich/chair/lora-bp/valentin.teutschbein/adapters/qa_cosmoqa_answer_generation_lora_v1", "overwrite_output_dir": false, "do_train": false, "do_eval": true, "do_predict": false, "eval_strategy": "steps", "prediction_loss_only": false, "per_device_train_batch_size": 4, "per_device_eval_batch_size": 8, "per_gpu_train_batch_size": null, "per_gpu_eval_batch_size": null, "gradient_accumulation_steps": 4, "eval_accumulation_steps": null, "eval_delay": 0, "torch_empty_cache_steps": null, "learning_rate": 2e-05, "weight_decay": 0.0, "adam_beta1": 0.9, "adam_beta2": 0.999, "adam_epsilon": 1e-08, "max_grad_norm": 1.0, "num_train_epochs": 3, "max_steps": -1, "lr_scheduler_type": "linear", "lr_scheduler_kwargs": {}, "warmup_ratio": 0.0, "warmup_steps": 0, "log_level": "passive", "log_level_replica": "warning", "log_on_each_node": true, "logging_dir": "/sc/projects/sci-herbrich/chair/lora-bp/valentin.teutschbein/adapters/qa_cosmoqa_answer_generation_lora_v1/runs/Sep05_08-48-54_gx28", "logging_strategy": "steps", "logging_first_step": false, "logging_steps": 20, "logging_nan_inf_filter": true, "save_strategy": "epoch", "save_steps": 40, "save_total_limit": null, "save_safetensors": true, "save_on_each_node": false, "save_only_model": false, "restore_callback_states_from_checkpoint": false, "no_cuda": false, "use_cpu": false, "use_mps_device": false, "seed": 42, "data_seed": null, "jit_mode_eval": false, "use_ipex": false, "bf16": false, "fp16": false, "fp16_opt_level": "O1", "half_precision_backend": "auto", "bf16_full_eval": false, "fp16_full_eval": false, "tf32": null, "local_rank": 0, "ddp_backend": null, "tpu_num_cores": null, "tpu_metrics_debug": false, "debug": [], "dataloader_drop_last": false, "eval_steps": 20, "dataloader_num_workers": 0, "dataloader_prefetch_factor": null, "past_index": -1, "run_name": "/sc/projects/sci-herbrich/chair/lora-bp/valentin.teutschbein/adapters/qa_cosmoqa_answer_generation_lora_v1", "disable_tqdm": false, "remove_unused_columns": true, "label_names": null, "load_best_model_at_end": false, "metric_for_best_model": null, "greater_is_better": null, "ignore_data_skip": false, "fsdp": [], "fsdp_min_num_params": 0, "fsdp_config": { "min_num_params": 0, "xla": false, "xla_fsdp_v2": false, "xla_fsdp_grad_ckpt": false }, "fsdp_transformer_layer_cls_to_wrap": null, "accelerator_config": { "split_batches": false, "dispatch_batches": null, "even_batches": true, "use_seedable_sampler": true, "non_blocking": false, "gradient_accumulation_kwargs": null }, "deepspeed": null, "label_smoothing_factor": 0.0, "optim": "adamw_torch", "optim_args": null, "adafactor": false, "group_by_length": false, "length_column_name": "length", "report_to": [], "ddp_find_unused_parameters": null, "ddp_bucket_cap_mb": null, "ddp_broadcast_buffers": null, "dataloader_pin_memory": true, "dataloader_persistent_workers": false, "skip_memory_metrics": true, "use_legacy_prediction_loop": false, "push_to_hub": false, "resume_from_checkpoint": null, "hub_model_id": null, "hub_strategy": "every_save", "hub_token": "", "hub_private_repo": null, "hub_always_push": false, "gradient_checkpointing": false, "gradient_checkpointing_kwargs": null, "include_inputs_for_metrics": false, "include_for_metrics": [], "eval_do_concat_batches": true, "fp16_backend": "auto", "push_to_hub_model_id": null, "push_to_hub_organization": null, "push_to_hub_token": "", "mp_parameters": "", "auto_find_batch_size": false, "full_determinism": false, "torchdynamo": null, "ray_scope": "last", "ddp_timeout": 1800, "torch_compile": false, "torch_compile_backend": null, "torch_compile_mode": null, "include_tokens_per_second": false, "include_num_input_tokens_seen": false, "neftune_noise_alpha": null, "optim_target_modules": null, "batch_eval_metrics": false, "eval_on_start": false, "use_liger_kernel": false, "eval_use_gather_object": false, "average_tokens_across_devices": false }, "lora_config": { "task_type": "CAUSAL_LM", "peft_type": "LORA", "auto_mapping": null, "base_model_name_or_path": "meta-llama/Llama-3.2-1B-Instruct", "revision": null, "inference_mode": false, "r": 16, "target_modules": [ "down_proj", "o_proj", "v_proj", "up_proj", "k_proj", "gate_proj", "q_proj" ], "exclude_modules": null, "lora_alpha": 16, "lora_dropout": 0.1, "fan_in_fan_out": false, "bias": "none", "use_rslora": true, "modules_to_save": null, "init_lora_weights": true, "layers_to_transform": null, "layers_pattern": null, "rank_pattern": {}, "alpha_pattern": {}, "megatron_config": null, "megatron_core": "megatron.core", "trainable_token_indices": null, "loftq_config": {}, "eva_config": null, "corda_config": null, "use_dora": false, "layer_replication": null, "runtime_config": { "ephemeral_gpu_offload": false }, "lora_bias": false }, "flops": { "eval": 49705559881469696, "train": 1.368968336766336e+16, "total": 6.339524324913306e+16 }, "total_energy": 20.575300000000002, "logs": [ { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 08:49:23.131069", "step": 0, "epoch": 0 }, { "type": "pplx", "content": 107.54594258685516, "timestamp": "2025-09-05 08:49:23.133261", "step": 0, "epoch": 0 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:49:23.415457", "step": 0, "epoch": 1 }, { "type": "loss", "content": 0.6169126629829407, "timestamp": "2025-09-05 08:49:23.417206", "step": 1, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 08:49:23.625279", "step": 1, "epoch": 1 }, { "type": "loss", "content": 0.7517166137695312, "timestamp": "2025-09-05 08:49:23.627202", "step": 2, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:49:23.802422", "step": 2, "epoch": 1 }, { "type": "loss", "content": 0.6235635280609131, "timestamp": "2025-09-05 08:49:23.804678", "step": 3, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:49:23.985359", "step": 3, "epoch": 1 }, { "type": "loss", "content": 0.8837859630584717, "timestamp": "2025-09-05 08:49:24.194974", "step": 4, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:49:24.346141", "step": 4, "epoch": 1 }, { "type": "loss", "content": 0.643877387046814, "timestamp": "2025-09-05 08:49:24.348074", "step": 5, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:49:24.522959", "step": 5, "epoch": 1 }, { "type": "loss", "content": 0.7255116105079651, "timestamp": "2025-09-05 08:49:24.524779", "step": 6, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:49:24.695679", "step": 6, "epoch": 1 }, { "type": "loss", "content": 0.5929327607154846, "timestamp": "2025-09-05 08:49:24.697536", "step": 7, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:49:24.876445", "step": 7, "epoch": 1 }, { "type": "loss", "content": 0.6998869776725769, "timestamp": "2025-09-05 08:49:24.892849", "step": 8, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:49:25.062476", "step": 8, "epoch": 1 }, { "type": "loss", "content": 0.7311207056045532, "timestamp": "2025-09-05 08:49:25.064482", "step": 9, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:49:25.235009", "step": 9, "epoch": 1 }, { "type": "loss", "content": 0.6596672534942627, "timestamp": "2025-09-05 08:49:25.238640", "step": 10, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:49:25.414055", "step": 10, "epoch": 1 }, { "type": "loss", "content": 0.8627897500991821, "timestamp": "2025-09-05 08:49:25.416048", "step": 11, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:49:25.585764", "step": 11, "epoch": 1 }, { "type": "loss", "content": 0.7733902335166931, "timestamp": "2025-09-05 08:49:25.602455", "step": 12, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:49:25.771572", "step": 12, "epoch": 1 }, { "type": "loss", "content": 0.6929965615272522, "timestamp": "2025-09-05 08:49:25.773466", "step": 13, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:49:25.950489", "step": 13, "epoch": 1 }, { "type": "loss", "content": 0.622175395488739, "timestamp": "2025-09-05 08:49:25.952905", "step": 14, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:49:26.125306", "step": 14, "epoch": 1 }, { "type": "loss", "content": 0.5498591065406799, "timestamp": "2025-09-05 08:49:26.127215", "step": 15, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:49:26.296529", "step": 15, "epoch": 1 }, { "type": "loss", "content": 0.5476340651512146, "timestamp": "2025-09-05 08:49:26.312315", "step": 16, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:49:26.474267", "step": 16, "epoch": 1 }, { "type": "loss", "content": 0.592532217502594, "timestamp": "2025-09-05 08:49:26.476179", "step": 17, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:49:26.645322", "step": 17, "epoch": 1 }, { "type": "loss", "content": 0.624656617641449, "timestamp": "2025-09-05 08:49:26.647578", "step": 18, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 08:49:26.819386", "step": 18, "epoch": 1 }, { "type": "loss", "content": 0.6042789816856384, "timestamp": "2025-09-05 08:49:26.821212", "step": 19, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:49:26.992800", "step": 19, "epoch": 1 }, { "type": "loss", "content": 0.54128497838974, "timestamp": "2025-09-05 08:49:27.006809", "step": 20, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 08:49:31.633614", "step": 20, "epoch": 1 }, { "type": "pplx", "content": 98.93025738022824, "timestamp": "2025-09-05 08:49:31.636504", "step": 20, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:49:31.771651", "step": 20, "epoch": 1 }, { "type": "loss", "content": 0.6219188570976257, "timestamp": "2025-09-05 08:49:31.773972", "step": 21, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 08:49:31.945046", "step": 21, "epoch": 1 }, { "type": "loss", "content": 0.6864243745803833, "timestamp": "2025-09-05 08:49:31.947077", "step": 22, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:49:32.119341", "step": 22, "epoch": 1 }, { "type": "loss", "content": 0.46079257130622864, "timestamp": "2025-09-05 08:49:32.121160", "step": 23, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 08:49:32.292480", "step": 23, "epoch": 1 }, { "type": "loss", "content": 0.6522855162620544, "timestamp": "2025-09-05 08:49:32.307856", "step": 24, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:49:32.472758", "step": 24, "epoch": 1 }, { "type": "loss", "content": 0.4554615318775177, "timestamp": "2025-09-05 08:49:32.474764", "step": 25, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:49:32.643264", "step": 25, "epoch": 1 }, { "type": "loss", "content": 0.530107319355011, "timestamp": "2025-09-05 08:49:32.645124", "step": 26, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:49:32.823718", "step": 26, "epoch": 1 }, { "type": "loss", "content": 0.5458198189735413, "timestamp": "2025-09-05 08:49:32.825511", "step": 27, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:49:32.995958", "step": 27, "epoch": 1 }, { "type": "loss", "content": 0.6185097098350525, "timestamp": "2025-09-05 08:49:33.012305", "step": 28, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:49:33.181153", "step": 28, "epoch": 1 }, { "type": "loss", "content": 0.48863735795021057, "timestamp": "2025-09-05 08:49:33.183739", "step": 29, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:49:33.360817", "step": 29, "epoch": 1 }, { "type": "loss", "content": 0.4185200035572052, "timestamp": "2025-09-05 08:49:33.362640", "step": 30, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:49:33.532666", "step": 30, "epoch": 1 }, { "type": "loss", "content": 0.5862270593643188, "timestamp": "2025-09-05 08:49:33.534345", "step": 31, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:49:33.709825", "step": 31, "epoch": 1 }, { "type": "loss", "content": 0.5197166800498962, "timestamp": "2025-09-05 08:49:33.724694", "step": 32, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:49:33.893918", "step": 32, "epoch": 1 }, { "type": "loss", "content": 0.4848283529281616, "timestamp": "2025-09-05 08:49:33.897285", "step": 33, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:49:34.070516", "step": 33, "epoch": 1 }, { "type": "loss", "content": 0.48005831241607666, "timestamp": "2025-09-05 08:49:34.072556", "step": 34, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:49:34.244353", "step": 34, "epoch": 1 }, { "type": "loss", "content": 0.5067130923271179, "timestamp": "2025-09-05 08:49:34.246260", "step": 35, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:49:34.423602", "step": 35, "epoch": 1 }, { "type": "loss", "content": 0.5473061800003052, "timestamp": "2025-09-05 08:49:34.438717", "step": 36, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:49:34.608695", "step": 36, "epoch": 1 }, { "type": "loss", "content": 0.5381602048873901, "timestamp": "2025-09-05 08:49:34.610450", "step": 37, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:49:34.781445", "step": 37, "epoch": 1 }, { "type": "loss", "content": 0.53970867395401, "timestamp": "2025-09-05 08:49:34.783221", "step": 38, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:49:34.961725", "step": 38, "epoch": 1 }, { "type": "loss", "content": 0.5283556580543518, "timestamp": "2025-09-05 08:49:34.963772", "step": 39, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:49:35.140557", "step": 39, "epoch": 1 }, { "type": "loss", "content": 0.4740239679813385, "timestamp": "2025-09-05 08:49:35.154682", "step": 40, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 08:49:39.785388", "step": 40, "epoch": 1 }, { "type": "pplx", "content": 87.66840919832546, "timestamp": "2025-09-05 08:49:39.787493", "step": 40, "epoch": 1 }, { "type": "info", "content": "Checkpoint saved at step 40", "timestamp": "2025-09-05 08:49:40.264753", "step": 40, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:49:40.437738", "step": 40, "epoch": 1 }, { "type": "loss", "content": 0.48637235164642334, "timestamp": "2025-09-05 08:49:40.439854", "step": 41, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:49:40.645274", "step": 41, "epoch": 1 }, { "type": "loss", "content": 0.5245602130889893, "timestamp": "2025-09-05 08:49:40.647013", "step": 42, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:49:40.852808", "step": 42, "epoch": 1 }, { "type": "loss", "content": 0.4956504702568054, "timestamp": "2025-09-05 08:49:40.854709", "step": 43, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:49:41.052545", "step": 43, "epoch": 1 }, { "type": "loss", "content": 0.49369287490844727, "timestamp": "2025-09-05 08:49:41.069275", "step": 44, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:49:41.266843", "step": 44, "epoch": 1 }, { "type": "loss", "content": 0.5377260446548462, "timestamp": "2025-09-05 08:49:41.268933", "step": 45, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:49:41.467042", "step": 45, "epoch": 1 }, { "type": "loss", "content": 0.3377070426940918, "timestamp": "2025-09-05 08:49:41.468889", "step": 46, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:49:41.666652", "step": 46, "epoch": 1 }, { "type": "loss", "content": 0.49804773926734924, "timestamp": "2025-09-05 08:49:41.668696", "step": 47, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:49:41.866243", "step": 47, "epoch": 1 }, { "type": "loss", "content": 0.5739774703979492, "timestamp": "2025-09-05 08:49:41.881595", "step": 48, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 08:49:42.080545", "step": 48, "epoch": 1 }, { "type": "loss", "content": 0.44619235396385193, "timestamp": "2025-09-05 08:49:42.082562", "step": 49, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:49:42.287133", "step": 49, "epoch": 1 }, { "type": "loss", "content": 0.5007292032241821, "timestamp": "2025-09-05 08:49:42.289340", "step": 50, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:49:42.490217", "step": 50, "epoch": 1 }, { "type": "loss", "content": 0.47838714718818665, "timestamp": "2025-09-05 08:49:42.492212", "step": 51, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:49:42.699436", "step": 51, "epoch": 1 }, { "type": "loss", "content": 0.5947479605674744, "timestamp": "2025-09-05 08:49:42.714002", "step": 52, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:49:42.904233", "step": 52, "epoch": 1 }, { "type": "loss", "content": 0.4078122675418854, "timestamp": "2025-09-05 08:49:42.906345", "step": 53, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:49:43.103998", "step": 53, "epoch": 1 }, { "type": "loss", "content": 0.3910931646823883, "timestamp": "2025-09-05 08:49:43.106013", "step": 54, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:49:43.306892", "step": 54, "epoch": 1 }, { "type": "loss", "content": 0.5116702318191528, "timestamp": "2025-09-05 08:49:43.308847", "step": 55, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:49:43.506454", "step": 55, "epoch": 1 }, { "type": "loss", "content": 0.40538397431373596, "timestamp": "2025-09-05 08:49:43.523075", "step": 56, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:49:43.720676", "step": 56, "epoch": 1 }, { "type": "loss", "content": 0.3850765526294708, "timestamp": "2025-09-05 08:49:43.722821", "step": 57, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:49:43.919447", "step": 57, "epoch": 1 }, { "type": "loss", "content": 0.5028924345970154, "timestamp": "2025-09-05 08:49:43.922267", "step": 58, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 08:49:44.120144", "step": 58, "epoch": 1 }, { "type": "loss", "content": 0.5674346089363098, "timestamp": "2025-09-05 08:49:44.122315", "step": 59, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:49:44.318793", "step": 59, "epoch": 1 }, { "type": "loss", "content": 0.4544109106063843, "timestamp": "2025-09-05 08:49:44.332801", "step": 60, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 08:49:48.947455", "step": 60, "epoch": 1 }, { "type": "pplx", "content": 80.36248908828922, "timestamp": "2025-09-05 08:49:48.949747", "step": 60, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:49:49.112997", "step": 60, "epoch": 1 }, { "type": "loss", "content": 0.4171891510486603, "timestamp": "2025-09-05 08:49:49.115100", "step": 61, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:49:49.283044", "step": 61, "epoch": 1 }, { "type": "loss", "content": 0.32811057567596436, "timestamp": "2025-09-05 08:49:49.285337", "step": 62, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 08:49:49.491105", "step": 62, "epoch": 1 }, { "type": "loss", "content": 0.5731877088546753, "timestamp": "2025-09-05 08:49:49.493306", "step": 63, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:49:49.692196", "step": 63, "epoch": 1 }, { "type": "loss", "content": 0.49480125308036804, "timestamp": "2025-09-05 08:49:49.708875", "step": 64, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:49:49.906046", "step": 64, "epoch": 1 }, { "type": "loss", "content": 0.5468859076499939, "timestamp": "2025-09-05 08:49:49.908063", "step": 65, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:49:50.105533", "step": 65, "epoch": 1 }, { "type": "loss", "content": 0.40803736448287964, "timestamp": "2025-09-05 08:49:50.107576", "step": 66, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:49:50.303363", "step": 66, "epoch": 1 }, { "type": "loss", "content": 0.5299073457717896, "timestamp": "2025-09-05 08:49:50.306069", "step": 67, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:49:50.506081", "step": 67, "epoch": 1 }, { "type": "loss", "content": 0.42571789026260376, "timestamp": "2025-09-05 08:49:50.526032", "step": 68, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:49:50.716930", "step": 68, "epoch": 1 }, { "type": "loss", "content": 0.4201086461544037, "timestamp": "2025-09-05 08:49:50.718937", "step": 69, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:49:50.916075", "step": 69, "epoch": 1 }, { "type": "loss", "content": 0.4884677231311798, "timestamp": "2025-09-05 08:49:50.918031", "step": 70, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 08:49:51.116971", "step": 70, "epoch": 1 }, { "type": "loss", "content": 0.41253289580345154, "timestamp": "2025-09-05 08:49:51.118932", "step": 71, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:49:51.326776", "step": 71, "epoch": 1 }, { "type": "loss", "content": 0.45812252163887024, "timestamp": "2025-09-05 08:49:51.341164", "step": 72, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 08:49:51.538100", "step": 72, "epoch": 1 }, { "type": "loss", "content": 0.5628377795219421, "timestamp": "2025-09-05 08:49:51.540135", "step": 73, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 4480027263872.0 }, "timestamp": "2025-09-05 08:49:51.742012", "step": 73, "epoch": 1 }, { "type": "loss", "content": 0.5147489905357361, "timestamp": "2025-09-05 08:49:51.743976", "step": 74, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:49:51.950073", "step": 74, "epoch": 1 }, { "type": "loss", "content": 0.36594030261039734, "timestamp": "2025-09-05 08:49:51.952199", "step": 75, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:49:52.149922", "step": 75, "epoch": 1 }, { "type": "loss", "content": 0.4461878538131714, "timestamp": "2025-09-05 08:49:52.164170", "step": 76, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:49:52.353769", "step": 76, "epoch": 1 }, { "type": "loss", "content": 0.4893703758716583, "timestamp": "2025-09-05 08:49:52.355812", "step": 77, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:49:52.552180", "step": 77, "epoch": 1 }, { "type": "loss", "content": 0.4122447967529297, "timestamp": "2025-09-05 08:49:52.554284", "step": 78, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:49:52.762974", "step": 78, "epoch": 1 }, { "type": "loss", "content": 0.37286263704299927, "timestamp": "2025-09-05 08:49:52.764953", "step": 79, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:49:52.959916", "step": 79, "epoch": 1 }, { "type": "loss", "content": 0.32313352823257446, "timestamp": "2025-09-05 08:49:52.974096", "step": 80, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 08:49:57.618632", "step": 80, "epoch": 1 }, { "type": "pplx", "content": 75.94659559691486, "timestamp": "2025-09-05 08:49:57.620939", "step": 80, "epoch": 1 }, { "type": "info", "content": "Checkpoint saved at step 80", "timestamp": "2025-09-05 08:49:58.082937", "step": 80, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:49:58.253682", "step": 80, "epoch": 1 }, { "type": "loss", "content": 0.4822365939617157, "timestamp": "2025-09-05 08:49:58.255959", "step": 81, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:49:58.453126", "step": 81, "epoch": 1 }, { "type": "loss", "content": 0.41527748107910156, "timestamp": "2025-09-05 08:49:58.455406", "step": 82, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:49:58.651029", "step": 82, "epoch": 1 }, { "type": "loss", "content": 0.49072203040122986, "timestamp": "2025-09-05 08:49:58.652972", "step": 83, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:49:58.852717", "step": 83, "epoch": 1 }, { "type": "loss", "content": 0.48262521624565125, "timestamp": "2025-09-05 08:49:58.867136", "step": 84, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:49:59.058672", "step": 84, "epoch": 1 }, { "type": "loss", "content": 0.4185166656970978, "timestamp": "2025-09-05 08:49:59.060765", "step": 85, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:49:59.257019", "step": 85, "epoch": 1 }, { "type": "loss", "content": 0.37654638290405273, "timestamp": "2025-09-05 08:49:59.259103", "step": 86, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:49:59.468228", "step": 86, "epoch": 1 }, { "type": "loss", "content": 0.4784839451313019, "timestamp": "2025-09-05 08:49:59.470022", "step": 87, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 4480027263872.0 }, "timestamp": "2025-09-05 08:49:59.668389", "step": 87, "epoch": 1 }, { "type": "loss", "content": 0.4580772817134857, "timestamp": "2025-09-05 08:49:59.684569", "step": 88, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 08:49:59.884885", "step": 88, "epoch": 1 }, { "type": "loss", "content": 0.4850596487522125, "timestamp": "2025-09-05 08:49:59.887173", "step": 89, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:50:00.083921", "step": 89, "epoch": 1 }, { "type": "loss", "content": 0.4436725974082947, "timestamp": "2025-09-05 08:50:00.085702", "step": 90, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:50:00.292103", "step": 90, "epoch": 1 }, { "type": "loss", "content": 0.3864012360572815, "timestamp": "2025-09-05 08:50:00.294189", "step": 91, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:50:00.493198", "step": 91, "epoch": 1 }, { "type": "loss", "content": 0.41005319356918335, "timestamp": "2025-09-05 08:50:00.507541", "step": 92, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:50:00.697222", "step": 92, "epoch": 1 }, { "type": "loss", "content": 0.29754865169525146, "timestamp": "2025-09-05 08:50:00.699018", "step": 93, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:50:00.895786", "step": 93, "epoch": 1 }, { "type": "loss", "content": 0.5100644826889038, "timestamp": "2025-09-05 08:50:00.897573", "step": 94, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 4480027263872.0 }, "timestamp": "2025-09-05 08:50:01.104437", "step": 94, "epoch": 1 }, { "type": "loss", "content": 0.5071147680282593, "timestamp": "2025-09-05 08:50:01.106343", "step": 95, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:50:01.304995", "step": 95, "epoch": 1 }, { "type": "loss", "content": 0.5573170781135559, "timestamp": "2025-09-05 08:50:01.319332", "step": 96, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:50:01.517000", "step": 96, "epoch": 1 }, { "type": "loss", "content": 0.5759362578392029, "timestamp": "2025-09-05 08:50:01.518928", "step": 97, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:50:01.718213", "step": 97, "epoch": 1 }, { "type": "loss", "content": 0.44677627086639404, "timestamp": "2025-09-05 08:50:01.720227", "step": 98, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:50:01.927026", "step": 98, "epoch": 1 }, { "type": "loss", "content": 0.5631275177001953, "timestamp": "2025-09-05 08:50:01.928989", "step": 99, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:50:02.126438", "step": 99, "epoch": 1 }, { "type": "loss", "content": 0.39676016569137573, "timestamp": "2025-09-05 08:50:02.141475", "step": 100, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 08:50:06.771451", "step": 100, "epoch": 1 }, { "type": "pplx", "content": 73.19032358278096, "timestamp": "2025-09-05 08:50:06.773213", "step": 100, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:50:06.935894", "step": 100, "epoch": 1 }, { "type": "loss", "content": 0.34731829166412354, "timestamp": "2025-09-05 08:50:06.937859", "step": 101, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-05 08:50:07.147849", "step": 101, "epoch": 1 }, { "type": "loss", "content": 0.36952462792396545, "timestamp": "2025-09-05 08:50:07.149839", "step": 102, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:50:07.355748", "step": 102, "epoch": 1 }, { "type": "loss", "content": 0.4270903468132019, "timestamp": "2025-09-05 08:50:07.357775", "step": 103, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:50:07.555093", "step": 103, "epoch": 1 }, { "type": "loss", "content": 0.3678099513053894, "timestamp": "2025-09-05 08:50:07.569703", "step": 104, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:50:07.766139", "step": 104, "epoch": 1 }, { "type": "loss", "content": 0.40596431493759155, "timestamp": "2025-09-05 08:50:07.768238", "step": 105, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:50:07.965392", "step": 105, "epoch": 1 }, { "type": "loss", "content": 0.44920292496681213, "timestamp": "2025-09-05 08:50:07.967648", "step": 106, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:50:08.174690", "step": 106, "epoch": 1 }, { "type": "loss", "content": 0.27072614431381226, "timestamp": "2025-09-05 08:50:08.176704", "step": 107, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:50:08.374070", "step": 107, "epoch": 1 }, { "type": "loss", "content": 0.44459718465805054, "timestamp": "2025-09-05 08:50:08.388419", "step": 108, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 4480027263872.0 }, "timestamp": "2025-09-05 08:50:08.578453", "step": 108, "epoch": 1 }, { "type": "loss", "content": 0.3547823131084442, "timestamp": "2025-09-05 08:50:08.580293", "step": 109, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:50:08.777417", "step": 109, "epoch": 1 }, { "type": "loss", "content": 0.32400763034820557, "timestamp": "2025-09-05 08:50:08.779441", "step": 110, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:50:08.985458", "step": 110, "epoch": 1 }, { "type": "loss", "content": 0.3592005968093872, "timestamp": "2025-09-05 08:50:08.987409", "step": 111, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:50:09.193950", "step": 111, "epoch": 1 }, { "type": "loss", "content": 0.6158202886581421, "timestamp": "2025-09-05 08:50:09.208214", "step": 112, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:50:09.399264", "step": 112, "epoch": 1 }, { "type": "loss", "content": 0.36427071690559387, "timestamp": "2025-09-05 08:50:09.401514", "step": 113, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 08:50:09.610077", "step": 113, "epoch": 1 }, { "type": "loss", "content": 0.28574255108833313, "timestamp": "2025-09-05 08:50:09.611869", "step": 114, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:50:09.810026", "step": 114, "epoch": 1 }, { "type": "loss", "content": 0.33345794677734375, "timestamp": "2025-09-05 08:50:09.812004", "step": 115, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:50:10.009667", "step": 115, "epoch": 1 }, { "type": "loss", "content": 0.368472695350647, "timestamp": "2025-09-05 08:50:10.026099", "step": 116, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:50:10.224608", "step": 116, "epoch": 1 }, { "type": "loss", "content": 0.3890371322631836, "timestamp": "2025-09-05 08:50:10.226530", "step": 117, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:50:10.433837", "step": 117, "epoch": 1 }, { "type": "loss", "content": 0.44138211011886597, "timestamp": "2025-09-05 08:50:10.435730", "step": 118, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:50:10.652742", "step": 118, "epoch": 1 }, { "type": "loss", "content": 0.3401064872741699, "timestamp": "2025-09-05 08:50:10.654683", "step": 119, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:50:10.860449", "step": 119, "epoch": 1 }, { "type": "loss", "content": 0.35354849696159363, "timestamp": "2025-09-05 08:50:10.876223", "step": 120, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 08:50:15.516995", "step": 120, "epoch": 1 }, { "type": "pplx", "content": 71.52772055113894, "timestamp": "2025-09-05 08:50:15.519261", "step": 120, "epoch": 1 }, { "type": "info", "content": "Checkpoint saved at step 120", "timestamp": "2025-09-05 08:50:15.977316", "step": 120, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:50:16.152308", "step": 120, "epoch": 1 }, { "type": "loss", "content": 0.4743736684322357, "timestamp": "2025-09-05 08:50:16.155092", "step": 121, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:50:16.352168", "step": 121, "epoch": 1 }, { "type": "loss", "content": 0.36207839846611023, "timestamp": "2025-09-05 08:50:16.354167", "step": 122, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 08:50:16.554847", "step": 122, "epoch": 1 }, { "type": "loss", "content": 0.4461155831813812, "timestamp": "2025-09-05 08:50:16.556726", "step": 123, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 08:50:16.765987", "step": 123, "epoch": 1 }, { "type": "loss", "content": 0.3289283215999603, "timestamp": "2025-09-05 08:50:16.780395", "step": 124, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:50:16.978534", "step": 124, "epoch": 1 }, { "type": "loss", "content": 0.39339011907577515, "timestamp": "2025-09-05 08:50:16.980573", "step": 125, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:50:17.179671", "step": 125, "epoch": 1 }, { "type": "loss", "content": 0.3922661244869232, "timestamp": "2025-09-05 08:50:17.181546", "step": 126, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:50:17.379811", "step": 126, "epoch": 1 }, { "type": "loss", "content": 0.48367026448249817, "timestamp": "2025-09-05 08:50:17.381618", "step": 127, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:50:17.579726", "step": 127, "epoch": 1 }, { "type": "loss", "content": 0.3444158434867859, "timestamp": "2025-09-05 08:50:17.596345", "step": 128, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:50:17.793894", "step": 128, "epoch": 1 }, { "type": "loss", "content": 0.4369712471961975, "timestamp": "2025-09-05 08:50:17.795934", "step": 129, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:50:17.993686", "step": 129, "epoch": 1 }, { "type": "loss", "content": 0.331216037273407, "timestamp": "2025-09-05 08:50:17.995933", "step": 130, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:50:18.193092", "step": 130, "epoch": 1 }, { "type": "loss", "content": 0.3580273389816284, "timestamp": "2025-09-05 08:50:18.195148", "step": 131, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:50:18.392606", "step": 131, "epoch": 1 }, { "type": "loss", "content": 0.31754398345947266, "timestamp": "2025-09-05 08:50:18.407026", "step": 132, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 4480027263872.0 }, "timestamp": "2025-09-05 08:50:18.594763", "step": 132, "epoch": 1 }, { "type": "loss", "content": 0.5454202890396118, "timestamp": "2025-09-05 08:50:18.596921", "step": 133, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:50:18.794716", "step": 133, "epoch": 1 }, { "type": "loss", "content": 0.43953007459640503, "timestamp": "2025-09-05 08:50:18.796695", "step": 134, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:50:18.994641", "step": 134, "epoch": 1 }, { "type": "loss", "content": 0.31143492460250854, "timestamp": "2025-09-05 08:50:18.996835", "step": 135, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:50:19.193535", "step": 135, "epoch": 1 }, { "type": "loss", "content": 0.48143064975738525, "timestamp": "2025-09-05 08:50:19.210065", "step": 136, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:50:19.406796", "step": 136, "epoch": 1 }, { "type": "loss", "content": 0.3588726818561554, "timestamp": "2025-09-05 08:50:19.408708", "step": 137, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:50:19.606770", "step": 137, "epoch": 1 }, { "type": "loss", "content": 0.38282686471939087, "timestamp": "2025-09-05 08:50:19.609583", "step": 138, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:50:19.815782", "step": 138, "epoch": 1 }, { "type": "loss", "content": 0.42063143849372864, "timestamp": "2025-09-05 08:50:19.818967", "step": 139, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:50:20.017464", "step": 139, "epoch": 1 }, { "type": "loss", "content": 0.4207852780818939, "timestamp": "2025-09-05 08:50:20.031795", "step": 140, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 08:50:24.652104", "step": 140, "epoch": 1 }, { "type": "pplx", "content": 70.65361235350733, "timestamp": "2025-09-05 08:50:24.654036", "step": 140, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:50:24.817137", "step": 140, "epoch": 1 }, { "type": "loss", "content": 0.34773746132850647, "timestamp": "2025-09-05 08:50:24.819048", "step": 141, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:50:25.026073", "step": 141, "epoch": 1 }, { "type": "loss", "content": 0.4449472725391388, "timestamp": "2025-09-05 08:50:25.027902", "step": 142, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:50:25.224932", "step": 142, "epoch": 1 }, { "type": "loss", "content": 0.4693523645401001, "timestamp": "2025-09-05 08:50:25.231172", "step": 143, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 4480027263872.0 }, "timestamp": "2025-09-05 08:50:25.439007", "step": 143, "epoch": 1 }, { "type": "loss", "content": 0.3564371168613434, "timestamp": "2025-09-05 08:50:25.453380", "step": 144, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:50:25.650784", "step": 144, "epoch": 1 }, { "type": "loss", "content": 0.37880274653434753, "timestamp": "2025-09-05 08:50:25.652686", "step": 145, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:50:25.859559", "step": 145, "epoch": 1 }, { "type": "loss", "content": 0.31610339879989624, "timestamp": "2025-09-05 08:50:25.862056", "step": 146, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:50:26.059662", "step": 146, "epoch": 1 }, { "type": "loss", "content": 0.46133285760879517, "timestamp": "2025-09-05 08:50:26.061481", "step": 147, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:50:26.268593", "step": 147, "epoch": 1 }, { "type": "loss", "content": 0.413718044757843, "timestamp": "2025-09-05 08:50:26.283153", "step": 148, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 08:50:26.469459", "step": 148, "epoch": 1 }, { "type": "loss", "content": 0.5549363493919373, "timestamp": "2025-09-05 08:50:26.471379", "step": 149, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:50:26.677868", "step": 149, "epoch": 1 }, { "type": "loss", "content": 0.29777753353118896, "timestamp": "2025-09-05 08:50:26.679559", "step": 150, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:50:26.875740", "step": 150, "epoch": 1 }, { "type": "loss", "content": 0.2828120291233063, "timestamp": "2025-09-05 08:50:26.878140", "step": 151, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:50:27.073797", "step": 151, "epoch": 1 }, { "type": "loss", "content": 0.4168826639652252, "timestamp": "2025-09-05 08:50:27.088398", "step": 152, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:50:27.277947", "step": 152, "epoch": 1 }, { "type": "loss", "content": 0.35562440752983093, "timestamp": "2025-09-05 08:50:27.279668", "step": 153, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:50:27.475780", "step": 153, "epoch": 1 }, { "type": "loss", "content": 0.4816155433654785, "timestamp": "2025-09-05 08:50:27.477761", "step": 154, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:50:27.675934", "step": 154, "epoch": 1 }, { "type": "loss", "content": 0.48008760809898376, "timestamp": "2025-09-05 08:50:27.677685", "step": 155, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:50:27.883926", "step": 155, "epoch": 1 }, { "type": "loss", "content": 0.42271170020103455, "timestamp": "2025-09-05 08:50:27.898295", "step": 156, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:50:28.088213", "step": 156, "epoch": 1 }, { "type": "loss", "content": 0.3952147960662842, "timestamp": "2025-09-05 08:50:28.090116", "step": 157, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:50:28.286788", "step": 157, "epoch": 1 }, { "type": "loss", "content": 0.3233911097049713, "timestamp": "2025-09-05 08:50:28.288735", "step": 158, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 08:50:28.496521", "step": 158, "epoch": 1 }, { "type": "loss", "content": 0.49929437041282654, "timestamp": "2025-09-05 08:50:28.498313", "step": 159, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:50:28.704841", "step": 159, "epoch": 1 }, { "type": "loss", "content": 0.4313942492008209, "timestamp": "2025-09-05 08:50:28.719313", "step": 160, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 08:50:33.321295", "step": 160, "epoch": 1 }, { "type": "pplx", "content": 70.65175573066632, "timestamp": "2025-09-05 08:50:33.323369", "step": 160, "epoch": 1 }, { "type": "info", "content": "Checkpoint saved at step 160", "timestamp": "2025-09-05 08:50:33.770272", "step": 160, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:50:33.957466", "step": 160, "epoch": 1 }, { "type": "loss", "content": 0.34339627623558044, "timestamp": "2025-09-05 08:50:33.959506", "step": 161, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:50:34.164500", "step": 161, "epoch": 1 }, { "type": "loss", "content": 0.45955607295036316, "timestamp": "2025-09-05 08:50:34.166511", "step": 162, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 4480027263872.0 }, "timestamp": "2025-09-05 08:50:34.364395", "step": 162, "epoch": 1 }, { "type": "loss", "content": 0.4148689806461334, "timestamp": "2025-09-05 08:50:34.366375", "step": 163, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:50:34.572456", "step": 163, "epoch": 1 }, { "type": "loss", "content": 0.34146106243133545, "timestamp": "2025-09-05 08:50:34.588946", "step": 164, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:50:34.786564", "step": 164, "epoch": 1 }, { "type": "loss", "content": 0.3519960045814514, "timestamp": "2025-09-05 08:50:34.788408", "step": 165, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:50:34.985928", "step": 165, "epoch": 1 }, { "type": "loss", "content": 0.2612743675708771, "timestamp": "2025-09-05 08:50:34.988031", "step": 166, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:50:35.186266", "step": 166, "epoch": 1 }, { "type": "loss", "content": 0.37086915969848633, "timestamp": "2025-09-05 08:50:35.188111", "step": 167, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:50:35.395045", "step": 167, "epoch": 1 }, { "type": "loss", "content": 0.3923819661140442, "timestamp": "2025-09-05 08:50:35.411448", "step": 168, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:50:35.605525", "step": 168, "epoch": 1 }, { "type": "loss", "content": 0.5148409605026245, "timestamp": "2025-09-05 08:50:35.607404", "step": 169, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:50:35.803412", "step": 169, "epoch": 1 }, { "type": "loss", "content": 0.34032538533210754, "timestamp": "2025-09-05 08:50:35.805098", "step": 170, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:50:36.010702", "step": 170, "epoch": 1 }, { "type": "loss", "content": 0.5385991930961609, "timestamp": "2025-09-05 08:50:36.012528", "step": 171, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 08:50:36.211564", "step": 171, "epoch": 1 }, { "type": "loss", "content": 0.3534490466117859, "timestamp": "2025-09-05 08:50:36.225837", "step": 172, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 08:50:36.419293", "step": 172, "epoch": 1 }, { "type": "loss", "content": 0.3421189785003662, "timestamp": "2025-09-05 08:50:36.421098", "step": 173, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:50:36.626841", "step": 173, "epoch": 1 }, { "type": "loss", "content": 0.2873148024082184, "timestamp": "2025-09-05 08:50:36.628694", "step": 174, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:50:36.826986", "step": 174, "epoch": 1 }, { "type": "loss", "content": 0.28435009717941284, "timestamp": "2025-09-05 08:50:36.828813", "step": 175, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:50:37.025748", "step": 175, "epoch": 1 }, { "type": "loss", "content": 0.3528280258178711, "timestamp": "2025-09-05 08:50:37.039984", "step": 176, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:50:37.229084", "step": 176, "epoch": 1 }, { "type": "loss", "content": 0.4834965467453003, "timestamp": "2025-09-05 08:50:37.230935", "step": 177, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:50:37.428456", "step": 177, "epoch": 1 }, { "type": "loss", "content": 0.33752936124801636, "timestamp": "2025-09-05 08:50:37.430302", "step": 178, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:50:37.636783", "step": 178, "epoch": 1 }, { "type": "loss", "content": 0.44458168745040894, "timestamp": "2025-09-05 08:50:37.638938", "step": 179, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:50:37.837233", "step": 179, "epoch": 1 }, { "type": "loss", "content": 0.40980052947998047, "timestamp": "2025-09-05 08:50:37.853984", "step": 180, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 08:50:42.486191", "step": 180, "epoch": 1 }, { "type": "pplx", "content": 69.88217681460745, "timestamp": "2025-09-05 08:50:42.488185", "step": 180, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-05 08:50:42.649403", "step": 180, "epoch": 1 }, { "type": "loss", "content": 0.3106338381767273, "timestamp": "2025-09-05 08:50:42.653048", "step": 181, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:50:42.858542", "step": 181, "epoch": 1 }, { "type": "loss", "content": 0.33472582697868347, "timestamp": "2025-09-05 08:50:42.861054", "step": 182, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 08:50:43.059529", "step": 182, "epoch": 1 }, { "type": "loss", "content": 0.5153182148933411, "timestamp": "2025-09-05 08:50:43.061699", "step": 183, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:50:43.259054", "step": 183, "epoch": 1 }, { "type": "loss", "content": 0.3759649693965912, "timestamp": "2025-09-05 08:50:43.275668", "step": 184, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:50:43.473055", "step": 184, "epoch": 1 }, { "type": "loss", "content": 0.25464075803756714, "timestamp": "2025-09-05 08:50:43.475222", "step": 185, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:50:43.672348", "step": 185, "epoch": 1 }, { "type": "loss", "content": 0.46503615379333496, "timestamp": "2025-09-05 08:50:43.674547", "step": 186, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:50:43.870814", "step": 186, "epoch": 1 }, { "type": "loss", "content": 0.36886751651763916, "timestamp": "2025-09-05 08:50:43.872745", "step": 187, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:50:44.061475", "step": 187, "epoch": 1 }, { "type": "loss", "content": 0.31488415598869324, "timestamp": "2025-09-05 08:50:44.076163", "step": 188, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:50:44.257798", "step": 188, "epoch": 1 }, { "type": "loss", "content": 0.41387924551963806, "timestamp": "2025-09-05 08:50:44.259666", "step": 189, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:50:44.463752", "step": 189, "epoch": 1 }, { "type": "loss", "content": 0.3407166600227356, "timestamp": "2025-09-05 08:50:44.465741", "step": 190, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:50:44.662388", "step": 190, "epoch": 1 }, { "type": "loss", "content": 0.3083406686782837, "timestamp": "2025-09-05 08:50:44.666316", "step": 191, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:50:44.866525", "step": 191, "epoch": 1 }, { "type": "loss", "content": 0.46717512607574463, "timestamp": "2025-09-05 08:50:44.882923", "step": 192, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:50:45.079418", "step": 192, "epoch": 1 }, { "type": "loss", "content": 0.37655341625213623, "timestamp": "2025-09-05 08:50:45.082245", "step": 193, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:50:45.286188", "step": 193, "epoch": 1 }, { "type": "loss", "content": 0.2714085876941681, "timestamp": "2025-09-05 08:50:45.289071", "step": 194, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:50:45.489108", "step": 194, "epoch": 1 }, { "type": "loss", "content": 0.44437673687934875, "timestamp": "2025-09-05 08:50:45.491274", "step": 195, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:50:45.693314", "step": 195, "epoch": 1 }, { "type": "loss", "content": 0.525282621383667, "timestamp": "2025-09-05 08:50:45.708374", "step": 196, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:50:45.900556", "step": 196, "epoch": 1 }, { "type": "loss", "content": 0.3641051650047302, "timestamp": "2025-09-05 08:50:45.905337", "step": 197, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:50:46.107005", "step": 197, "epoch": 1 }, { "type": "loss", "content": 0.306657075881958, "timestamp": "2025-09-05 08:50:46.109337", "step": 198, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:50:46.315314", "step": 198, "epoch": 1 }, { "type": "loss", "content": 0.3821370601654053, "timestamp": "2025-09-05 08:50:46.318001", "step": 199, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:50:46.530150", "step": 199, "epoch": 1 }, { "type": "loss", "content": 0.3802856504917145, "timestamp": "2025-09-05 08:50:46.546252", "step": 200, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 08:50:51.304338", "step": 200, "epoch": 1 }, { "type": "pplx", "content": 68.43342716093302, "timestamp": "2025-09-05 08:50:51.307018", "step": 200, "epoch": 1 }, { "type": "info", "content": "Checkpoint saved at step 200", "timestamp": "2025-09-05 08:50:51.794754", "step": 200, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:50:51.964613", "step": 200, "epoch": 1 }, { "type": "loss", "content": 0.32048162817955017, "timestamp": "2025-09-05 08:50:51.966794", "step": 201, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:50:52.173366", "step": 201, "epoch": 1 }, { "type": "loss", "content": 0.3719877600669861, "timestamp": "2025-09-05 08:50:52.175903", "step": 202, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:50:52.374299", "step": 202, "epoch": 1 }, { "type": "loss", "content": 0.47192662954330444, "timestamp": "2025-09-05 08:50:52.376239", "step": 203, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:50:52.573112", "step": 203, "epoch": 1 }, { "type": "loss", "content": 0.35540571808815, "timestamp": "2025-09-05 08:50:52.587796", "step": 204, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 08:50:52.777809", "step": 204, "epoch": 1 }, { "type": "loss", "content": 0.4534974694252014, "timestamp": "2025-09-05 08:50:52.779680", "step": 205, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:50:52.976475", "step": 205, "epoch": 1 }, { "type": "loss", "content": 0.2541635036468506, "timestamp": "2025-09-05 08:50:52.978872", "step": 206, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:50:53.186056", "step": 206, "epoch": 1 }, { "type": "loss", "content": 0.4390164017677307, "timestamp": "2025-09-05 08:50:53.187977", "step": 207, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:50:53.394871", "step": 207, "epoch": 1 }, { "type": "loss", "content": 0.36420944333076477, "timestamp": "2025-09-05 08:50:53.409950", "step": 208, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:50:53.598684", "step": 208, "epoch": 1 }, { "type": "loss", "content": 0.3081270754337311, "timestamp": "2025-09-05 08:50:53.601276", "step": 209, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:50:53.807163", "step": 209, "epoch": 1 }, { "type": "loss", "content": 0.32181161642074585, "timestamp": "2025-09-05 08:50:53.809123", "step": 210, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:50:54.016227", "step": 210, "epoch": 1 }, { "type": "loss", "content": 0.37587931752204895, "timestamp": "2025-09-05 08:50:54.018183", "step": 211, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:50:54.224894", "step": 211, "epoch": 1 }, { "type": "loss", "content": 0.3801794648170471, "timestamp": "2025-09-05 08:50:54.241161", "step": 212, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:50:54.438898", "step": 212, "epoch": 1 }, { "type": "loss", "content": 0.3296459913253784, "timestamp": "2025-09-05 08:50:54.440903", "step": 213, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:50:54.636463", "step": 213, "epoch": 1 }, { "type": "loss", "content": 0.40059760212898254, "timestamp": "2025-09-05 08:50:54.638462", "step": 214, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 08:50:54.845342", "step": 214, "epoch": 1 }, { "type": "loss", "content": 0.45545437932014465, "timestamp": "2025-09-05 08:50:54.847321", "step": 215, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:50:55.045554", "step": 215, "epoch": 1 }, { "type": "loss", "content": 0.5213336944580078, "timestamp": "2025-09-05 08:50:55.059885", "step": 216, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:50:55.248331", "step": 216, "epoch": 1 }, { "type": "loss", "content": 0.3463953733444214, "timestamp": "2025-09-05 08:50:55.250555", "step": 217, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:50:55.456155", "step": 217, "epoch": 1 }, { "type": "loss", "content": 0.36651042103767395, "timestamp": "2025-09-05 08:50:55.458184", "step": 218, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:50:55.664880", "step": 218, "epoch": 1 }, { "type": "loss", "content": 0.39168816804885864, "timestamp": "2025-09-05 08:50:55.666768", "step": 219, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:50:55.871436", "step": 219, "epoch": 1 }, { "type": "loss", "content": 0.43237030506134033, "timestamp": "2025-09-05 08:50:55.885583", "step": 220, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 08:51:00.528160", "step": 220, "epoch": 1 }, { "type": "pplx", "content": 66.70753911619636, "timestamp": "2025-09-05 08:51:00.530320", "step": 220, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:51:00.694163", "step": 220, "epoch": 1 }, { "type": "loss", "content": 0.27549687027931213, "timestamp": "2025-09-05 08:51:00.696251", "step": 221, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:51:00.863090", "step": 221, "epoch": 1 }, { "type": "loss", "content": 0.4633401036262512, "timestamp": "2025-09-05 08:51:00.865224", "step": 222, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:51:01.071101", "step": 222, "epoch": 1 }, { "type": "loss", "content": 0.297171413898468, "timestamp": "2025-09-05 08:51:01.073322", "step": 223, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:51:01.281221", "step": 223, "epoch": 1 }, { "type": "loss", "content": 0.38710904121398926, "timestamp": "2025-09-05 08:51:01.298898", "step": 224, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:51:01.496439", "step": 224, "epoch": 1 }, { "type": "loss", "content": 0.4238070249557495, "timestamp": "2025-09-05 08:51:01.498489", "step": 225, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:51:01.706000", "step": 225, "epoch": 1 }, { "type": "loss", "content": 0.4317415952682495, "timestamp": "2025-09-05 08:51:01.708487", "step": 226, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:51:01.908165", "step": 226, "epoch": 1 }, { "type": "loss", "content": 0.31439393758773804, "timestamp": "2025-09-05 08:51:01.910255", "step": 227, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:51:02.116936", "step": 227, "epoch": 1 }, { "type": "loss", "content": 0.2890167236328125, "timestamp": "2025-09-05 08:51:02.131642", "step": 228, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 08:51:02.324743", "step": 228, "epoch": 1 }, { "type": "loss", "content": 0.4581849277019501, "timestamp": "2025-09-05 08:51:02.327444", "step": 229, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:51:02.525196", "step": 229, "epoch": 1 }, { "type": "loss", "content": 0.3810874819755554, "timestamp": "2025-09-05 08:51:02.527445", "step": 230, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:51:02.733657", "step": 230, "epoch": 1 }, { "type": "loss", "content": 0.2984119653701782, "timestamp": "2025-09-05 08:51:02.736785", "step": 231, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:51:02.934302", "step": 231, "epoch": 1 }, { "type": "loss", "content": 0.37125611305236816, "timestamp": "2025-09-05 08:51:02.948305", "step": 232, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:51:03.144451", "step": 232, "epoch": 1 }, { "type": "loss", "content": 0.2847510874271393, "timestamp": "2025-09-05 08:51:03.146597", "step": 233, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:51:03.352972", "step": 233, "epoch": 1 }, { "type": "loss", "content": 0.33447766304016113, "timestamp": "2025-09-05 08:51:03.355044", "step": 234, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:51:03.552131", "step": 234, "epoch": 1 }, { "type": "loss", "content": 0.32979437708854675, "timestamp": "2025-09-05 08:51:03.554213", "step": 235, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:51:03.751051", "step": 235, "epoch": 1 }, { "type": "loss", "content": 0.45558422803878784, "timestamp": "2025-09-05 08:51:03.764967", "step": 236, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:51:03.953115", "step": 236, "epoch": 1 }, { "type": "loss", "content": 0.3728318214416504, "timestamp": "2025-09-05 08:51:03.955116", "step": 237, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 08:51:04.152915", "step": 237, "epoch": 1 }, { "type": "loss", "content": 0.31200921535491943, "timestamp": "2025-09-05 08:51:04.155039", "step": 238, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:51:04.360922", "step": 238, "epoch": 1 }, { "type": "loss", "content": 0.548952579498291, "timestamp": "2025-09-05 08:51:04.362714", "step": 239, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:51:04.560346", "step": 239, "epoch": 1 }, { "type": "loss", "content": 0.28765401244163513, "timestamp": "2025-09-05 08:51:04.576314", "step": 240, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 08:51:09.223025", "step": 240, "epoch": 1 }, { "type": "pplx", "content": 65.06928550317252, "timestamp": "2025-09-05 08:51:09.225006", "step": 240, "epoch": 1 }, { "type": "info", "content": "Checkpoint saved at step 240", "timestamp": "2025-09-05 08:51:09.703186", "step": 240, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:51:09.874355", "step": 240, "epoch": 1 }, { "type": "loss", "content": 0.3755890130996704, "timestamp": "2025-09-05 08:51:09.876365", "step": 241, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:51:10.072449", "step": 241, "epoch": 1 }, { "type": "loss", "content": 0.38877207040786743, "timestamp": "2025-09-05 08:51:10.074440", "step": 242, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:51:10.280829", "step": 242, "epoch": 1 }, { "type": "loss", "content": 0.49628746509552, "timestamp": "2025-09-05 08:51:10.282995", "step": 243, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:51:10.482218", "step": 243, "epoch": 1 }, { "type": "loss", "content": 0.5397868752479553, "timestamp": "2025-09-05 08:51:10.498957", "step": 244, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:51:10.696694", "step": 244, "epoch": 1 }, { "type": "loss", "content": 0.3450985848903656, "timestamp": "2025-09-05 08:51:10.698570", "step": 245, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:51:10.893683", "step": 245, "epoch": 1 }, { "type": "loss", "content": 0.30554214119911194, "timestamp": "2025-09-05 08:51:10.895563", "step": 246, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:51:11.091433", "step": 246, "epoch": 1 }, { "type": "loss", "content": 0.3367531895637512, "timestamp": "2025-09-05 08:51:11.093469", "step": 247, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:51:11.298971", "step": 247, "epoch": 1 }, { "type": "loss", "content": 0.33166182041168213, "timestamp": "2025-09-05 08:51:11.313272", "step": 248, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:51:11.503053", "step": 248, "epoch": 1 }, { "type": "loss", "content": 0.2959182858467102, "timestamp": "2025-09-05 08:51:11.505145", "step": 249, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:51:11.703129", "step": 249, "epoch": 1 }, { "type": "loss", "content": 0.3209533095359802, "timestamp": "2025-09-05 08:51:11.705161", "step": 250, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:51:11.902631", "step": 250, "epoch": 1 }, { "type": "loss", "content": 0.3812739849090576, "timestamp": "2025-09-05 08:51:11.904804", "step": 251, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 08:51:12.103336", "step": 251, "epoch": 1 }, { "type": "loss", "content": 0.31804463267326355, "timestamp": "2025-09-05 08:51:12.119920", "step": 252, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:51:12.325726", "step": 252, "epoch": 1 }, { "type": "loss", "content": 0.38454669713974, "timestamp": "2025-09-05 08:51:12.327480", "step": 253, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:51:12.533970", "step": 253, "epoch": 1 }, { "type": "loss", "content": 0.357085257768631, "timestamp": "2025-09-05 08:51:12.535934", "step": 254, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:51:12.732659", "step": 254, "epoch": 1 }, { "type": "loss", "content": 0.5086879730224609, "timestamp": "2025-09-05 08:51:12.734686", "step": 255, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 4480027263872.0 }, "timestamp": "2025-09-05 08:51:12.942675", "step": 255, "epoch": 1 }, { "type": "loss", "content": 0.31578245759010315, "timestamp": "2025-09-05 08:51:12.957030", "step": 256, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:51:13.145896", "step": 256, "epoch": 1 }, { "type": "loss", "content": 0.530243456363678, "timestamp": "2025-09-05 08:51:13.147719", "step": 257, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:51:13.344584", "step": 257, "epoch": 1 }, { "type": "loss", "content": 0.3682427406311035, "timestamp": "2025-09-05 08:51:13.346470", "step": 258, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:51:13.543961", "step": 258, "epoch": 1 }, { "type": "loss", "content": 0.3959914743900299, "timestamp": "2025-09-05 08:51:13.545895", "step": 259, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 08:51:13.744614", "step": 259, "epoch": 1 }, { "type": "loss", "content": 0.46029427647590637, "timestamp": "2025-09-05 08:51:13.758973", "step": 260, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 08:51:18.409861", "step": 260, "epoch": 1 }, { "type": "pplx", "content": 64.26154569680897, "timestamp": "2025-09-05 08:51:18.412481", "step": 260, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:51:18.575862", "step": 260, "epoch": 1 }, { "type": "loss", "content": 0.4201185405254364, "timestamp": "2025-09-05 08:51:18.579922", "step": 261, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:51:18.749512", "step": 261, "epoch": 1 }, { "type": "loss", "content": 0.34828004240989685, "timestamp": "2025-09-05 08:51:18.751937", "step": 262, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:51:18.961324", "step": 262, "epoch": 1 }, { "type": "loss", "content": 0.3680271506309509, "timestamp": "2025-09-05 08:51:18.963334", "step": 263, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:51:19.161929", "step": 263, "epoch": 1 }, { "type": "loss", "content": 0.430779367685318, "timestamp": "2025-09-05 08:51:19.177248", "step": 264, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:51:19.370004", "step": 264, "epoch": 1 }, { "type": "loss", "content": 0.4136819839477539, "timestamp": "2025-09-05 08:51:19.372994", "step": 265, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:51:19.582552", "step": 265, "epoch": 1 }, { "type": "loss", "content": 0.33954256772994995, "timestamp": "2025-09-05 08:51:19.585434", "step": 266, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:51:19.788353", "step": 266, "epoch": 1 }, { "type": "loss", "content": 0.3512232005596161, "timestamp": "2025-09-05 08:51:19.791506", "step": 267, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:51:20.000570", "step": 267, "epoch": 1 }, { "type": "loss", "content": 0.22923022508621216, "timestamp": "2025-09-05 08:51:20.016069", "step": 268, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:51:20.207996", "step": 268, "epoch": 1 }, { "type": "loss", "content": 0.39777079224586487, "timestamp": "2025-09-05 08:51:20.210365", "step": 269, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:51:20.410582", "step": 269, "epoch": 1 }, { "type": "loss", "content": 0.34263524413108826, "timestamp": "2025-09-05 08:51:20.412726", "step": 270, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:51:20.611599", "step": 270, "epoch": 1 }, { "type": "loss", "content": 0.33798688650131226, "timestamp": "2025-09-05 08:51:20.616122", "step": 271, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:51:20.815141", "step": 271, "epoch": 1 }, { "type": "loss", "content": 0.5267437696456909, "timestamp": "2025-09-05 08:51:20.834848", "step": 272, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:51:21.034317", "step": 272, "epoch": 1 }, { "type": "loss", "content": 0.41006460785865784, "timestamp": "2025-09-05 08:51:21.038358", "step": 273, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:51:21.244494", "step": 273, "epoch": 1 }, { "type": "loss", "content": 0.4595869779586792, "timestamp": "2025-09-05 08:51:21.247391", "step": 274, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:51:21.450319", "step": 274, "epoch": 1 }, { "type": "loss", "content": 0.35671982169151306, "timestamp": "2025-09-05 08:51:21.452813", "step": 275, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:51:21.662382", "step": 275, "epoch": 1 }, { "type": "loss", "content": 0.3464692234992981, "timestamp": "2025-09-05 08:51:21.678478", "step": 276, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:51:21.873740", "step": 276, "epoch": 1 }, { "type": "loss", "content": 0.513616144657135, "timestamp": "2025-09-05 08:51:21.876507", "step": 277, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:51:22.086577", "step": 277, "epoch": 1 }, { "type": "loss", "content": 0.3644759953022003, "timestamp": "2025-09-05 08:51:22.090146", "step": 278, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:51:22.295231", "step": 278, "epoch": 1 }, { "type": "loss", "content": 0.30014386773109436, "timestamp": "2025-09-05 08:51:22.302241", "step": 279, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:51:22.516028", "step": 279, "epoch": 1 }, { "type": "loss", "content": 0.3263522684574127, "timestamp": "2025-09-05 08:51:22.532144", "step": 280, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 08:51:27.228302", "step": 280, "epoch": 1 }, { "type": "pplx", "content": 63.39580635831405, "timestamp": "2025-09-05 08:51:27.231907", "step": 280, "epoch": 1 }, { "type": "info", "content": "Checkpoint saved at step 280", "timestamp": "2025-09-05 08:51:27.737466", "step": 280, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:51:27.903601", "step": 280, "epoch": 1 }, { "type": "loss", "content": 0.2871383726596832, "timestamp": "2025-09-05 08:51:27.905516", "step": 281, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:51:28.113830", "step": 281, "epoch": 1 }, { "type": "loss", "content": 0.5244265198707581, "timestamp": "2025-09-05 08:51:28.116159", "step": 282, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:51:28.287939", "step": 282, "epoch": 1 }, { "type": "loss", "content": 0.30767932534217834, "timestamp": "2025-09-05 08:51:28.290430", "step": 283, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:51:28.498428", "step": 283, "epoch": 1 }, { "type": "loss", "content": 0.42134183645248413, "timestamp": "2025-09-05 08:51:28.516281", "step": 284, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:51:28.708204", "step": 284, "epoch": 1 }, { "type": "loss", "content": 0.3752477467060089, "timestamp": "2025-09-05 08:51:28.710520", "step": 285, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 4480027263872.0 }, "timestamp": "2025-09-05 08:51:28.917749", "step": 285, "epoch": 1 }, { "type": "loss", "content": 0.3297032415866852, "timestamp": "2025-09-05 08:51:28.920190", "step": 286, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:51:29.118017", "step": 286, "epoch": 1 }, { "type": "loss", "content": 0.3052656054496765, "timestamp": "2025-09-05 08:51:29.121103", "step": 287, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:51:29.326871", "step": 287, "epoch": 1 }, { "type": "loss", "content": 0.3355713188648224, "timestamp": "2025-09-05 08:51:29.344871", "step": 288, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 08:51:29.543183", "step": 288, "epoch": 1 }, { "type": "loss", "content": 0.35573214292526245, "timestamp": "2025-09-05 08:51:29.545838", "step": 289, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:51:29.753080", "step": 289, "epoch": 1 }, { "type": "loss", "content": 0.4543474316596985, "timestamp": "2025-09-05 08:51:29.756576", "step": 290, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:51:29.957195", "step": 290, "epoch": 1 }, { "type": "loss", "content": 0.3964625298976898, "timestamp": "2025-09-05 08:51:29.959974", "step": 291, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:51:30.165709", "step": 291, "epoch": 1 }, { "type": "loss", "content": 0.3353409469127655, "timestamp": "2025-09-05 08:51:30.180114", "step": 292, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 4480027263872.0 }, "timestamp": "2025-09-05 08:51:30.371786", "step": 292, "epoch": 1 }, { "type": "loss", "content": 0.4800986647605896, "timestamp": "2025-09-05 08:51:30.374286", "step": 293, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:51:30.576864", "step": 293, "epoch": 1 }, { "type": "loss", "content": 0.3704434633255005, "timestamp": "2025-09-05 08:51:30.579299", "step": 294, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:51:30.780856", "step": 294, "epoch": 1 }, { "type": "loss", "content": 0.4373602271080017, "timestamp": "2025-09-05 08:51:30.784033", "step": 295, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:51:30.983714", "step": 295, "epoch": 1 }, { "type": "loss", "content": 0.4753832519054413, "timestamp": "2025-09-05 08:51:30.998392", "step": 296, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:51:31.191891", "step": 296, "epoch": 1 }, { "type": "loss", "content": 0.4039275646209717, "timestamp": "2025-09-05 08:51:31.195051", "step": 297, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:51:31.409572", "step": 297, "epoch": 1 }, { "type": "loss", "content": 0.35360774397850037, "timestamp": "2025-09-05 08:51:31.411839", "step": 298, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:51:31.611994", "step": 298, "epoch": 1 }, { "type": "loss", "content": 0.4502357840538025, "timestamp": "2025-09-05 08:51:31.616480", "step": 299, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:51:31.824098", "step": 299, "epoch": 1 }, { "type": "loss", "content": 0.34486469626426697, "timestamp": "2025-09-05 08:51:31.839022", "step": 300, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 08:51:36.541321", "step": 300, "epoch": 1 }, { "type": "pplx", "content": 62.91448786259217, "timestamp": "2025-09-05 08:51:36.543514", "step": 300, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-05 08:51:36.706204", "step": 300, "epoch": 1 }, { "type": "loss", "content": 0.42969754338264465, "timestamp": "2025-09-05 08:51:36.708169", "step": 301, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:51:36.916708", "step": 301, "epoch": 1 }, { "type": "loss", "content": 0.3379324972629547, "timestamp": "2025-09-05 08:51:36.920013", "step": 302, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:51:37.118930", "step": 302, "epoch": 1 }, { "type": "loss", "content": 0.28274306654930115, "timestamp": "2025-09-05 08:51:37.121827", "step": 303, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 08:51:37.330465", "step": 303, "epoch": 1 }, { "type": "loss", "content": 0.3989158570766449, "timestamp": "2025-09-05 08:51:37.344698", "step": 304, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:51:37.535431", "step": 304, "epoch": 1 }, { "type": "loss", "content": 0.45338067412376404, "timestamp": "2025-09-05 08:51:37.538881", "step": 305, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-05 08:51:37.735743", "step": 305, "epoch": 1 }, { "type": "loss", "content": 0.4826226234436035, "timestamp": "2025-09-05 08:51:37.739435", "step": 306, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:51:37.937478", "step": 306, "epoch": 1 }, { "type": "loss", "content": 0.23507355153560638, "timestamp": "2025-09-05 08:51:37.939847", "step": 307, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:51:38.148933", "step": 307, "epoch": 1 }, { "type": "loss", "content": 0.41786983609199524, "timestamp": "2025-09-05 08:51:38.165846", "step": 308, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:51:38.364288", "step": 308, "epoch": 1 }, { "type": "loss", "content": 0.3461589813232422, "timestamp": "2025-09-05 08:51:38.366570", "step": 309, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:51:38.576965", "step": 309, "epoch": 1 }, { "type": "loss", "content": 0.4844832420349121, "timestamp": "2025-09-05 08:51:38.579372", "step": 310, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:51:38.787986", "step": 310, "epoch": 1 }, { "type": "loss", "content": 0.38971275091171265, "timestamp": "2025-09-05 08:51:38.790638", "step": 311, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 4480027263872.0 }, "timestamp": "2025-09-05 08:51:38.997464", "step": 311, "epoch": 1 }, { "type": "loss", "content": 0.474933385848999, "timestamp": "2025-09-05 08:51:39.013582", "step": 312, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 08:51:39.206540", "step": 312, "epoch": 1 }, { "type": "loss", "content": 0.37481558322906494, "timestamp": "2025-09-05 08:51:39.210174", "step": 313, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:51:39.410262", "step": 313, "epoch": 1 }, { "type": "loss", "content": 0.3350926637649536, "timestamp": "2025-09-05 08:51:39.412546", "step": 314, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:51:39.622336", "step": 314, "epoch": 1 }, { "type": "loss", "content": 0.5339053869247437, "timestamp": "2025-09-05 08:51:39.624825", "step": 315, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:51:39.836689", "step": 315, "epoch": 1 }, { "type": "loss", "content": 0.3207130432128906, "timestamp": "2025-09-05 08:51:39.851434", "step": 316, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 08:51:40.045517", "step": 316, "epoch": 1 }, { "type": "loss", "content": 0.35091298818588257, "timestamp": "2025-09-05 08:51:40.047800", "step": 317, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:51:40.248682", "step": 317, "epoch": 1 }, { "type": "loss", "content": 0.3151177763938904, "timestamp": "2025-09-05 08:51:40.251910", "step": 318, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:51:40.448949", "step": 318, "epoch": 1 }, { "type": "loss", "content": 0.2805836796760559, "timestamp": "2025-09-05 08:51:40.453248", "step": 319, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:51:40.664741", "step": 319, "epoch": 1 }, { "type": "loss", "content": 0.31773483753204346, "timestamp": "2025-09-05 08:51:40.679760", "step": 320, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 08:51:45.519892", "step": 320, "epoch": 1 }, { "type": "pplx", "content": 62.03072212739932, "timestamp": "2025-09-05 08:51:45.522026", "step": 320, "epoch": 1 }, { "type": "info", "content": "Checkpoint saved at step 320", "timestamp": "2025-09-05 08:51:45.987925", "step": 320, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:51:46.160525", "step": 320, "epoch": 1 }, { "type": "loss", "content": 0.287524551153183, "timestamp": "2025-09-05 08:51:46.164165", "step": 321, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:51:46.367278", "step": 321, "epoch": 1 }, { "type": "loss", "content": 0.356285959482193, "timestamp": "2025-09-05 08:51:46.369491", "step": 322, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:51:46.578658", "step": 322, "epoch": 1 }, { "type": "loss", "content": 0.5080327987670898, "timestamp": "2025-09-05 08:51:46.581762", "step": 323, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:51:46.786024", "step": 323, "epoch": 1 }, { "type": "loss", "content": 0.3835434019565582, "timestamp": "2025-09-05 08:51:46.803007", "step": 324, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:51:47.003397", "step": 324, "epoch": 1 }, { "type": "loss", "content": 0.3160017430782318, "timestamp": "2025-09-05 08:51:47.006010", "step": 325, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:51:47.207660", "step": 325, "epoch": 1 }, { "type": "loss", "content": 0.4292439818382263, "timestamp": "2025-09-05 08:51:47.210036", "step": 326, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 4800029206464.0 }, "timestamp": "2025-09-05 08:51:47.416253", "step": 326, "epoch": 1 }, { "type": "loss", "content": 0.5584763884544373, "timestamp": "2025-09-05 08:51:47.418687", "step": 327, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:51:47.628242", "step": 327, "epoch": 1 }, { "type": "loss", "content": 0.2741956412792206, "timestamp": "2025-09-05 08:51:47.645050", "step": 328, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 08:51:47.846435", "step": 328, "epoch": 1 }, { "type": "loss", "content": 0.45366376638412476, "timestamp": "2025-09-05 08:51:47.849132", "step": 329, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:51:48.049050", "step": 329, "epoch": 1 }, { "type": "loss", "content": 0.4707634449005127, "timestamp": "2025-09-05 08:51:48.051331", "step": 330, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:51:48.252308", "step": 330, "epoch": 1 }, { "type": "loss", "content": 0.32083678245544434, "timestamp": "2025-09-05 08:51:48.254709", "step": 331, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:51:48.453175", "step": 331, "epoch": 1 }, { "type": "loss", "content": 0.23594726622104645, "timestamp": "2025-09-05 08:51:48.469797", "step": 332, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:51:48.670973", "step": 332, "epoch": 1 }, { "type": "loss", "content": 0.342578649520874, "timestamp": "2025-09-05 08:51:48.673988", "step": 333, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:51:48.884254", "step": 333, "epoch": 1 }, { "type": "loss", "content": 0.3945893943309784, "timestamp": "2025-09-05 08:51:48.886685", "step": 334, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:51:49.086662", "step": 334, "epoch": 1 }, { "type": "loss", "content": 0.2937992811203003, "timestamp": "2025-09-05 08:51:49.090876", "step": 335, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:51:49.288803", "step": 335, "epoch": 1 }, { "type": "loss", "content": 0.3234483003616333, "timestamp": "2025-09-05 08:51:49.305690", "step": 336, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:51:49.506331", "step": 336, "epoch": 1 }, { "type": "loss", "content": 0.3142837584018707, "timestamp": "2025-09-05 08:51:49.508588", "step": 337, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-05 08:51:49.706321", "step": 337, "epoch": 1 }, { "type": "loss", "content": 0.29564931988716125, "timestamp": "2025-09-05 08:51:49.712194", "step": 338, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:51:49.919033", "step": 338, "epoch": 1 }, { "type": "loss", "content": 0.2818536162376404, "timestamp": "2025-09-05 08:51:49.921396", "step": 339, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:51:50.129125", "step": 339, "epoch": 1 }, { "type": "loss", "content": 0.31897684931755066, "timestamp": "2025-09-05 08:51:50.145532", "step": 340, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 08:51:54.884325", "step": 340, "epoch": 1 }, { "type": "pplx", "content": 61.25419831481063, "timestamp": "2025-09-05 08:51:54.886751", "step": 340, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:51:55.051072", "step": 340, "epoch": 1 }, { "type": "loss", "content": 0.3685891926288605, "timestamp": "2025-09-05 08:51:55.054219", "step": 341, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 08:51:55.265043", "step": 341, "epoch": 1 }, { "type": "loss", "content": 0.4979967176914215, "timestamp": "2025-09-05 08:51:55.267081", "step": 342, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:51:55.467739", "step": 342, "epoch": 1 }, { "type": "loss", "content": 0.4489175081253052, "timestamp": "2025-09-05 08:51:55.470109", "step": 343, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:51:55.671335", "step": 343, "epoch": 1 }, { "type": "loss", "content": 0.42277976870536804, "timestamp": "2025-09-05 08:51:55.686218", "step": 344, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:51:55.877423", "step": 344, "epoch": 1 }, { "type": "loss", "content": 0.26805245876312256, "timestamp": "2025-09-05 08:51:55.881086", "step": 345, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 08:51:56.083943", "step": 345, "epoch": 1 }, { "type": "loss", "content": 0.4023347496986389, "timestamp": "2025-09-05 08:51:56.086884", "step": 346, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:51:56.285593", "step": 346, "epoch": 1 }, { "type": "loss", "content": 0.21793437004089355, "timestamp": "2025-09-05 08:51:56.287738", "step": 347, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:51:56.497434", "step": 347, "epoch": 1 }, { "type": "loss", "content": 0.42367270588874817, "timestamp": "2025-09-05 08:51:56.514045", "step": 348, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:51:56.714891", "step": 348, "epoch": 1 }, { "type": "loss", "content": 0.19184860587120056, "timestamp": "2025-09-05 08:51:56.718750", "step": 349, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:51:56.916934", "step": 349, "epoch": 1 }, { "type": "loss", "content": 0.4862029254436493, "timestamp": "2025-09-05 08:51:56.919071", "step": 350, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:51:57.119662", "step": 350, "epoch": 1 }, { "type": "loss", "content": 0.39638984203338623, "timestamp": "2025-09-05 08:51:57.123211", "step": 351, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:51:57.321992", "step": 351, "epoch": 1 }, { "type": "loss", "content": 0.44743719696998596, "timestamp": "2025-09-05 08:51:57.337160", "step": 352, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:51:57.531589", "step": 352, "epoch": 1 }, { "type": "loss", "content": 0.3436514139175415, "timestamp": "2025-09-05 08:51:57.533859", "step": 353, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:51:57.733979", "step": 353, "epoch": 1 }, { "type": "loss", "content": 0.5128458142280579, "timestamp": "2025-09-05 08:51:57.736308", "step": 354, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:51:57.936356", "step": 354, "epoch": 1 }, { "type": "loss", "content": 0.38132208585739136, "timestamp": "2025-09-05 08:51:57.938784", "step": 355, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:51:58.151174", "step": 355, "epoch": 1 }, { "type": "loss", "content": 0.4043236970901489, "timestamp": "2025-09-05 08:51:58.166887", "step": 356, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:51:58.368726", "step": 356, "epoch": 1 }, { "type": "loss", "content": 0.4835319221019745, "timestamp": "2025-09-05 08:51:58.371395", "step": 357, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:51:58.579671", "step": 357, "epoch": 1 }, { "type": "loss", "content": 0.4353181719779968, "timestamp": "2025-09-05 08:51:58.581758", "step": 358, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:51:58.792551", "step": 358, "epoch": 1 }, { "type": "loss", "content": 0.532089352607727, "timestamp": "2025-09-05 08:51:58.795005", "step": 359, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:51:59.005467", "step": 359, "epoch": 1 }, { "type": "loss", "content": 0.36872991919517517, "timestamp": "2025-09-05 08:51:59.022067", "step": 360, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 08:52:03.741679", "step": 360, "epoch": 1 }, { "type": "pplx", "content": 60.85764402253067, "timestamp": "2025-09-05 08:52:03.744697", "step": 360, "epoch": 1 }, { "type": "info", "content": "Checkpoint saved at step 360", "timestamp": "2025-09-05 08:52:04.267463", "step": 360, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 08:52:04.467076", "step": 360, "epoch": 1 }, { "type": "loss", "content": 0.4337652027606964, "timestamp": "2025-09-05 08:52:04.469790", "step": 361, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:52:04.669343", "step": 361, "epoch": 1 }, { "type": "loss", "content": 0.3073671758174896, "timestamp": "2025-09-05 08:52:04.672067", "step": 362, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:52:04.870495", "step": 362, "epoch": 1 }, { "type": "loss", "content": 0.3424108922481537, "timestamp": "2025-09-05 08:52:04.872776", "step": 363, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:52:05.080213", "step": 363, "epoch": 1 }, { "type": "loss", "content": 0.4130728542804718, "timestamp": "2025-09-05 08:52:05.097032", "step": 364, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:52:05.296805", "step": 364, "epoch": 1 }, { "type": "loss", "content": 0.5122348666191101, "timestamp": "2025-09-05 08:52:05.299866", "step": 365, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 08:52:05.512098", "step": 365, "epoch": 1 }, { "type": "loss", "content": 0.35365021228790283, "timestamp": "2025-09-05 08:52:05.514520", "step": 366, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 08:52:05.725698", "step": 366, "epoch": 1 }, { "type": "loss", "content": 0.3705803155899048, "timestamp": "2025-09-05 08:52:05.728166", "step": 367, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:52:05.927508", "step": 367, "epoch": 1 }, { "type": "loss", "content": 0.31572356820106506, "timestamp": "2025-09-05 08:52:05.943003", "step": 368, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:52:06.136877", "step": 368, "epoch": 1 }, { "type": "loss", "content": 0.484190434217453, "timestamp": "2025-09-05 08:52:06.139123", "step": 369, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 08:52:06.338365", "step": 369, "epoch": 1 }, { "type": "loss", "content": 0.33856910467147827, "timestamp": "2025-09-05 08:52:06.340412", "step": 370, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 4480027263872.0 }, "timestamp": "2025-09-05 08:52:06.548382", "step": 370, "epoch": 1 }, { "type": "loss", "content": 0.33974528312683105, "timestamp": "2025-09-05 08:52:06.551882", "step": 371, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:52:06.725021", "step": 371, "epoch": 1 }, { "type": "loss", "content": 0.28406357765197754, "timestamp": "2025-09-05 08:52:06.734842", "step": 372, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:52:06.902028", "step": 372, "epoch": 1 }, { "type": "loss", "content": 0.4505447447299957, "timestamp": "2025-09-05 08:52:06.904266", "step": 373, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:52:07.072962", "step": 373, "epoch": 1 }, { "type": "loss", "content": 0.41192686557769775, "timestamp": "2025-09-05 08:52:07.075418", "step": 374, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:52:07.259228", "step": 374, "epoch": 1 }, { "type": "loss", "content": 0.48100942373275757, "timestamp": "2025-09-05 08:52:07.261588", "step": 375, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:52:07.442104", "step": 375, "epoch": 1 }, { "type": "loss", "content": 0.4199207127094269, "timestamp": "2025-09-05 08:52:07.451785", "step": 376, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:52:07.618469", "step": 376, "epoch": 1 }, { "type": "loss", "content": 0.44891834259033203, "timestamp": "2025-09-05 08:52:07.620478", "step": 377, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:52:07.794549", "step": 377, "epoch": 1 }, { "type": "loss", "content": 0.38180747628211975, "timestamp": "2025-09-05 08:52:07.796896", "step": 378, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:52:07.967300", "step": 378, "epoch": 1 }, { "type": "loss", "content": 0.4493151307106018, "timestamp": "2025-09-05 08:52:07.970670", "step": 379, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:52:08.151498", "step": 379, "epoch": 1 }, { "type": "loss", "content": 0.3687281906604767, "timestamp": "2025-09-05 08:52:08.161079", "step": 380, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 08:52:12.875645", "step": 380, "epoch": 1 }, { "type": "pplx", "content": 60.50173690886968, "timestamp": "2025-09-05 08:52:12.879267", "step": 380, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:52:13.045039", "step": 380, "epoch": 1 }, { "type": "loss", "content": 0.24862390756607056, "timestamp": "2025-09-05 08:52:13.050670", "step": 381, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:52:13.251042", "step": 381, "epoch": 1 }, { "type": "loss", "content": 0.3486191928386688, "timestamp": "2025-09-05 08:52:13.253890", "step": 382, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 08:52:13.465124", "step": 382, "epoch": 1 }, { "type": "loss", "content": 0.3658300042152405, "timestamp": "2025-09-05 08:52:13.467841", "step": 383, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:52:13.675325", "step": 383, "epoch": 1 }, { "type": "loss", "content": 0.31089478731155396, "timestamp": "2025-09-05 08:52:13.693652", "step": 384, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:52:13.879125", "step": 384, "epoch": 1 }, { "type": "loss", "content": 0.3752908408641815, "timestamp": "2025-09-05 08:52:13.881370", "step": 385, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:52:14.090355", "step": 385, "epoch": 1 }, { "type": "loss", "content": 0.5100250244140625, "timestamp": "2025-09-05 08:52:14.092533", "step": 386, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:52:14.301671", "step": 386, "epoch": 1 }, { "type": "loss", "content": 0.5159087181091309, "timestamp": "2025-09-05 08:52:14.307073", "step": 387, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:52:14.505032", "step": 387, "epoch": 1 }, { "type": "loss", "content": 0.4958210289478302, "timestamp": "2025-09-05 08:52:14.527608", "step": 388, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:52:14.729709", "step": 388, "epoch": 1 }, { "type": "loss", "content": 0.47933509945869446, "timestamp": "2025-09-05 08:52:14.732227", "step": 389, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:52:14.931821", "step": 389, "epoch": 1 }, { "type": "loss", "content": 0.4100204110145569, "timestamp": "2025-09-05 08:52:14.936678", "step": 390, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:52:15.148605", "step": 390, "epoch": 1 }, { "type": "loss", "content": 0.4352912902832031, "timestamp": "2025-09-05 08:52:15.151091", "step": 391, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:52:15.349293", "step": 391, "epoch": 1 }, { "type": "loss", "content": 0.39970317482948303, "timestamp": "2025-09-05 08:52:15.365860", "step": 392, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:52:15.567312", "step": 392, "epoch": 1 }, { "type": "loss", "content": 0.3486652076244354, "timestamp": "2025-09-05 08:52:15.569587", "step": 393, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 08:52:15.779892", "step": 393, "epoch": 1 }, { "type": "loss", "content": 0.4482623040676117, "timestamp": "2025-09-05 08:52:15.781888", "step": 394, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-05 08:52:15.955997", "step": 394, "epoch": 1 }, { "type": "loss", "content": 0.3497057557106018, "timestamp": "2025-09-05 08:52:15.959493", "step": 395, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:52:16.167562", "step": 395, "epoch": 1 }, { "type": "loss", "content": 0.2155306488275528, "timestamp": "2025-09-05 08:52:16.181975", "step": 396, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:52:16.374926", "step": 396, "epoch": 1 }, { "type": "loss", "content": 0.2984706163406372, "timestamp": "2025-09-05 08:52:16.377026", "step": 397, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:52:16.578129", "step": 397, "epoch": 1 }, { "type": "loss", "content": 0.4520842134952545, "timestamp": "2025-09-05 08:52:16.580222", "step": 398, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:52:16.787897", "step": 398, "epoch": 1 }, { "type": "loss", "content": 0.4184480905532837, "timestamp": "2025-09-05 08:52:16.791025", "step": 399, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:52:17.000795", "step": 399, "epoch": 1 }, { "type": "loss", "content": 0.391579806804657, "timestamp": "2025-09-05 08:52:17.016163", "step": 400, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 08:52:21.710130", "step": 400, "epoch": 1 }, { "type": "pplx", "content": 59.69678279140599, "timestamp": "2025-09-05 08:52:21.712252", "step": 400, "epoch": 1 }, { "type": "info", "content": "Checkpoint saved at step 400", "timestamp": "2025-09-05 08:52:22.212048", "step": 400, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:52:22.382310", "step": 400, "epoch": 1 }, { "type": "loss", "content": 0.5822342038154602, "timestamp": "2025-09-05 08:52:22.384599", "step": 401, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 08:52:22.583471", "step": 401, "epoch": 1 }, { "type": "loss", "content": 0.26364198327064514, "timestamp": "2025-09-05 08:52:22.585575", "step": 402, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:52:22.786247", "step": 402, "epoch": 1 }, { "type": "loss", "content": 0.415432333946228, "timestamp": "2025-09-05 08:52:22.788568", "step": 403, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 08:52:22.998089", "step": 403, "epoch": 1 }, { "type": "loss", "content": 0.43436604738235474, "timestamp": "2025-09-05 08:52:23.014811", "step": 404, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 08:52:23.218248", "step": 404, "epoch": 1 }, { "type": "loss", "content": 0.33526358008384705, "timestamp": "2025-09-05 08:52:23.220827", "step": 405, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 4480027263872.0 }, "timestamp": "2025-09-05 08:52:23.419773", "step": 405, "epoch": 1 }, { "type": "loss", "content": 0.3843635022640228, "timestamp": "2025-09-05 08:52:23.423685", "step": 406, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:52:23.622251", "step": 406, "epoch": 1 }, { "type": "loss", "content": 0.3481503427028656, "timestamp": "2025-09-05 08:52:23.625075", "step": 407, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:52:23.833551", "step": 407, "epoch": 1 }, { "type": "loss", "content": 0.400000661611557, "timestamp": "2025-09-05 08:52:23.847684", "step": 408, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:52:24.040538", "step": 408, "epoch": 1 }, { "type": "loss", "content": 0.3273821473121643, "timestamp": "2025-09-05 08:52:24.042628", "step": 409, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:52:24.251973", "step": 409, "epoch": 1 }, { "type": "loss", "content": 0.29011961817741394, "timestamp": "2025-09-05 08:52:24.254121", "step": 410, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:52:24.454853", "step": 410, "epoch": 1 }, { "type": "loss", "content": 0.4190179407596588, "timestamp": "2025-09-05 08:52:24.457738", "step": 411, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:52:24.655875", "step": 411, "epoch": 1 }, { "type": "loss", "content": 0.3011251389980316, "timestamp": "2025-09-05 08:52:24.670472", "step": 412, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 08:52:24.865142", "step": 412, "epoch": 1 }, { "type": "loss", "content": 0.34651055932044983, "timestamp": "2025-09-05 08:52:24.869347", "step": 413, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 08:52:25.068331", "step": 413, "epoch": 1 }, { "type": "loss", "content": 0.4374030530452728, "timestamp": "2025-09-05 08:52:25.071673", "step": 414, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:52:25.275075", "step": 414, "epoch": 1 }, { "type": "loss", "content": 0.3524605929851532, "timestamp": "2025-09-05 08:52:25.279079", "step": 415, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:52:25.476558", "step": 415, "epoch": 1 }, { "type": "loss", "content": 0.36618897318840027, "timestamp": "2025-09-05 08:52:25.491665", "step": 416, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-05 08:52:25.683339", "step": 416, "epoch": 1 }, { "type": "loss", "content": 0.3304573893547058, "timestamp": "2025-09-05 08:52:25.685271", "step": 417, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:52:25.893443", "step": 417, "epoch": 1 }, { "type": "loss", "content": 0.2957325577735901, "timestamp": "2025-09-05 08:52:25.895735", "step": 418, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:52:26.104484", "step": 418, "epoch": 1 }, { "type": "loss", "content": 0.3814864158630371, "timestamp": "2025-09-05 08:52:26.107123", "step": 419, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:52:26.312068", "step": 419, "epoch": 1 }, { "type": "loss", "content": 0.3151036202907562, "timestamp": "2025-09-05 08:52:26.328851", "step": 420, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 08:52:31.036153", "step": 420, "epoch": 1 }, { "type": "pplx", "content": 59.35530192163852, "timestamp": "2025-09-05 08:52:31.038978", "step": 420, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:52:31.210366", "step": 420, "epoch": 1 }, { "type": "loss", "content": 0.3436422348022461, "timestamp": "2025-09-05 08:52:31.216851", "step": 421, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 08:52:31.427152", "step": 421, "epoch": 1 }, { "type": "loss", "content": 0.24939358234405518, "timestamp": "2025-09-05 08:52:31.429321", "step": 422, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:52:31.603677", "step": 422, "epoch": 1 }, { "type": "loss", "content": 0.2410716712474823, "timestamp": "2025-09-05 08:52:31.605963", "step": 423, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:52:31.775227", "step": 423, "epoch": 1 }, { "type": "loss", "content": 0.4125831723213196, "timestamp": "2025-09-05 08:52:31.792232", "step": 424, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:52:31.995860", "step": 424, "epoch": 1 }, { "type": "loss", "content": 0.3474792242050171, "timestamp": "2025-09-05 08:52:31.998205", "step": 425, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 08:52:32.198239", "step": 425, "epoch": 1 }, { "type": "loss", "content": 0.31995099782943726, "timestamp": "2025-09-05 08:52:32.200884", "step": 426, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 08:52:32.413494", "step": 426, "epoch": 1 }, { "type": "loss", "content": 0.3391028046607971, "timestamp": "2025-09-05 08:52:32.416831", "step": 427, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 4480027263872.0 }, "timestamp": "2025-09-05 08:52:32.626920", "step": 427, "epoch": 1 }, { "type": "loss", "content": 0.38398104906082153, "timestamp": "2025-09-05 08:52:32.643118", "step": 428, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 08:52:32.839796", "step": 428, "epoch": 1 }, { "type": "loss", "content": 0.4830264151096344, "timestamp": "2025-09-05 08:52:32.843610", "step": 429, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:52:33.096484", "step": 429, "epoch": 1 }, { "type": "loss", "content": 0.38072633743286133, "timestamp": "2025-09-05 08:52:33.098709", "step": 430, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:52:33.350670", "step": 430, "epoch": 1 }, { "type": "loss", "content": 0.35430270433425903, "timestamp": "2025-09-05 08:52:33.393391", "step": 431, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:52:33.603830", "step": 431, "epoch": 1 }, { "type": "loss", "content": 0.33442458510398865, "timestamp": "2025-09-05 08:52:33.618554", "step": 432, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:52:33.810928", "step": 432, "epoch": 1 }, { "type": "loss", "content": 0.48529914021492004, "timestamp": "2025-09-05 08:52:33.814095", "step": 433, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 08:52:34.016352", "step": 433, "epoch": 1 }, { "type": "loss", "content": 0.414386510848999, "timestamp": "2025-09-05 08:52:34.018901", "step": 434, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:52:34.228728", "step": 434, "epoch": 1 }, { "type": "loss", "content": 0.5347086191177368, "timestamp": "2025-09-05 08:52:34.231277", "step": 435, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:52:34.430951", "step": 435, "epoch": 1 }, { "type": "loss", "content": 0.3398420214653015, "timestamp": "2025-09-05 08:52:34.446031", "step": 436, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:52:34.681402", "step": 436, "epoch": 1 }, { "type": "loss", "content": 0.33175885677337646, "timestamp": "2025-09-05 08:52:34.683884", "step": 437, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:52:34.891436", "step": 437, "epoch": 1 }, { "type": "loss", "content": 0.30734267830848694, "timestamp": "2025-09-05 08:52:34.894720", "step": 438, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:52:35.093238", "step": 438, "epoch": 1 }, { "type": "loss", "content": 0.2820775508880615, "timestamp": "2025-09-05 08:52:35.096035", "step": 439, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:52:35.389076", "step": 439, "epoch": 1 }, { "type": "loss", "content": 0.3953246772289276, "timestamp": "2025-09-05 08:52:35.405738", "step": 440, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 08:52:40.619422", "step": 440, "epoch": 1 }, { "type": "pplx", "content": 59.06938762912424, "timestamp": "2025-09-05 08:52:40.622034", "step": 440, "epoch": 1 }, { "type": "info", "content": "Checkpoint saved at step 440", "timestamp": "2025-09-05 08:52:41.124812", "step": 440, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:52:41.337105", "step": 440, "epoch": 1 }, { "type": "loss", "content": 0.428076833486557, "timestamp": "2025-09-05 08:52:41.362204", "step": 441, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:52:41.616790", "step": 441, "epoch": 1 }, { "type": "loss", "content": 0.30740123987197876, "timestamp": "2025-09-05 08:52:41.619651", "step": 442, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:52:41.829409", "step": 442, "epoch": 1 }, { "type": "loss", "content": 0.3571283221244812, "timestamp": "2025-09-05 08:52:41.831738", "step": 443, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:52:42.030375", "step": 443, "epoch": 1 }, { "type": "loss", "content": 0.3677189350128174, "timestamp": "2025-09-05 08:52:42.044872", "step": 444, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:52:42.237303", "step": 444, "epoch": 1 }, { "type": "loss", "content": 0.2622219920158386, "timestamp": "2025-09-05 08:52:42.240974", "step": 445, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 08:52:42.449712", "step": 445, "epoch": 1 }, { "type": "loss", "content": 0.3243364989757538, "timestamp": "2025-09-05 08:52:42.452727", "step": 446, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-05 08:52:42.704565", "step": 446, "epoch": 1 }, { "type": "loss", "content": 0.2399972379207611, "timestamp": "2025-09-05 08:52:42.706805", "step": 447, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:52:42.906947", "step": 447, "epoch": 1 }, { "type": "loss", "content": 0.37382861971855164, "timestamp": "2025-09-05 08:52:42.921140", "step": 448, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:52:43.115262", "step": 448, "epoch": 1 }, { "type": "loss", "content": 0.42507728934288025, "timestamp": "2025-09-05 08:52:43.118670", "step": 449, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-05 08:52:43.370405", "step": 449, "epoch": 1 }, { "type": "loss", "content": 0.30817678570747375, "timestamp": "2025-09-05 08:52:43.373234", "step": 450, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:52:43.575061", "step": 450, "epoch": 1 }, { "type": "loss", "content": 0.3689974248409271, "timestamp": "2025-09-05 08:52:43.581882", "step": 451, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:52:43.786456", "step": 451, "epoch": 1 }, { "type": "loss", "content": 0.3893755078315735, "timestamp": "2025-09-05 08:52:43.803301", "step": 452, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:52:44.068521", "step": 452, "epoch": 1 }, { "type": "loss", "content": 0.4274936020374298, "timestamp": "2025-09-05 08:52:44.071142", "step": 453, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:52:44.279260", "step": 453, "epoch": 1 }, { "type": "loss", "content": 0.24627014994621277, "timestamp": "2025-09-05 08:52:44.281140", "step": 454, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:52:44.488500", "step": 454, "epoch": 1 }, { "type": "loss", "content": 0.42970582842826843, "timestamp": "2025-09-05 08:52:44.495746", "step": 455, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:52:44.698466", "step": 455, "epoch": 1 }, { "type": "loss", "content": 0.4055229127407074, "timestamp": "2025-09-05 08:52:44.716271", "step": 456, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:52:44.910316", "step": 456, "epoch": 1 }, { "type": "loss", "content": 0.3416372537612915, "timestamp": "2025-09-05 08:52:44.927124", "step": 457, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-05 08:52:45.181071", "step": 457, "epoch": 1 }, { "type": "loss", "content": 0.2750747799873352, "timestamp": "2025-09-05 08:52:45.183843", "step": 458, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 08:52:45.394187", "step": 458, "epoch": 1 }, { "type": "loss", "content": 0.3312303125858307, "timestamp": "2025-09-05 08:52:45.396553", "step": 459, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 4480027263872.0 }, "timestamp": "2025-09-05 08:52:45.594916", "step": 459, "epoch": 1 }, { "type": "loss", "content": 0.4751344919204712, "timestamp": "2025-09-05 08:52:45.612023", "step": 460, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 08:52:51.067022", "step": 460, "epoch": 1 }, { "type": "pplx", "content": 59.67811926915719, "timestamp": "2025-09-05 08:52:51.069230", "step": 460, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:52:51.234053", "step": 460, "epoch": 1 }, { "type": "loss", "content": 0.3601231575012207, "timestamp": "2025-09-05 08:52:51.236726", "step": 461, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:52:51.451554", "step": 461, "epoch": 1 }, { "type": "loss", "content": 0.3900015354156494, "timestamp": "2025-09-05 08:52:51.459374", "step": 462, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:52:51.668075", "step": 462, "epoch": 1 }, { "type": "loss", "content": 0.355674684047699, "timestamp": "2025-09-05 08:52:51.676778", "step": 463, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:52:51.889242", "step": 463, "epoch": 1 }, { "type": "loss", "content": 0.30925506353378296, "timestamp": "2025-09-05 08:52:51.913685", "step": 464, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:52:52.111744", "step": 464, "epoch": 1 }, { "type": "loss", "content": 0.41809117794036865, "timestamp": "2025-09-05 08:52:52.114550", "step": 465, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:52:52.335190", "step": 465, "epoch": 1 }, { "type": "loss", "content": 0.31774064898490906, "timestamp": "2025-09-05 08:52:52.339915", "step": 466, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:52:52.548704", "step": 466, "epoch": 1 }, { "type": "loss", "content": 0.22886255383491516, "timestamp": "2025-09-05 08:52:52.593754", "step": 467, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:52:52.803549", "step": 467, "epoch": 1 }, { "type": "loss", "content": 0.3445352017879486, "timestamp": "2025-09-05 08:52:52.822863", "step": 468, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:52:53.018136", "step": 468, "epoch": 1 }, { "type": "loss", "content": 0.41649383306503296, "timestamp": "2025-09-05 08:52:53.020410", "step": 469, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:52:53.225503", "step": 469, "epoch": 1 }, { "type": "loss", "content": 0.5665808320045471, "timestamp": "2025-09-05 08:52:53.228809", "step": 470, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:52:53.431781", "step": 470, "epoch": 1 }, { "type": "loss", "content": 0.4123092293739319, "timestamp": "2025-09-05 08:52:53.433893", "step": 471, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:52:53.636862", "step": 471, "epoch": 1 }, { "type": "loss", "content": 0.4420497417449951, "timestamp": "2025-09-05 08:52:53.650920", "step": 472, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:52:53.845865", "step": 472, "epoch": 1 }, { "type": "loss", "content": 0.35629701614379883, "timestamp": "2025-09-05 08:52:53.850369", "step": 473, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:52:54.163112", "step": 473, "epoch": 1 }, { "type": "loss", "content": 0.3124181926250458, "timestamp": "2025-09-05 08:52:54.206389", "step": 474, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 08:52:54.461222", "step": 474, "epoch": 1 }, { "type": "loss", "content": 0.5296250581741333, "timestamp": "2025-09-05 08:52:54.463422", "step": 475, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:52:54.680325", "step": 475, "epoch": 1 }, { "type": "loss", "content": 0.42064934968948364, "timestamp": "2025-09-05 08:52:54.706467", "step": 476, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:52:54.892954", "step": 476, "epoch": 1 }, { "type": "loss", "content": 0.37133467197418213, "timestamp": "2025-09-05 08:52:54.898729", "step": 477, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:52:55.097691", "step": 477, "epoch": 1 }, { "type": "loss", "content": 0.311069518327713, "timestamp": "2025-09-05 08:52:55.100288", "step": 478, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:52:55.397284", "step": 478, "epoch": 1 }, { "type": "loss", "content": 0.174685537815094, "timestamp": "2025-09-05 08:52:55.401764", "step": 479, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:52:55.657892", "step": 479, "epoch": 1 }, { "type": "loss", "content": 0.4376969337463379, "timestamp": "2025-09-05 08:52:55.674695", "step": 480, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 08:53:01.067886", "step": 480, "epoch": 1 }, { "type": "pplx", "content": 60.60817761059333, "timestamp": "2025-09-05 08:53:01.070163", "step": 480, "epoch": 1 }, { "type": "info", "content": "Checkpoint saved at step 480", "timestamp": "2025-09-05 08:53:01.614052", "step": 480, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:53:01.877643", "step": 480, "epoch": 1 }, { "type": "loss", "content": 0.3756929636001587, "timestamp": "2025-09-05 08:53:01.883127", "step": 481, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:53:02.085810", "step": 481, "epoch": 1 }, { "type": "loss", "content": 0.39133283495903015, "timestamp": "2025-09-05 08:53:02.088041", "step": 482, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:53:02.296584", "step": 482, "epoch": 1 }, { "type": "loss", "content": 0.2793099284172058, "timestamp": "2025-09-05 08:53:02.303765", "step": 483, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 08:53:02.505169", "step": 483, "epoch": 1 }, { "type": "loss", "content": 0.3743014335632324, "timestamp": "2025-09-05 08:53:02.519161", "step": 484, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:53:02.754265", "step": 484, "epoch": 1 }, { "type": "loss", "content": 0.3891092538833618, "timestamp": "2025-09-05 08:53:02.798142", "step": 485, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:53:03.003319", "step": 485, "epoch": 1 }, { "type": "loss", "content": 0.377437561750412, "timestamp": "2025-09-05 08:53:03.005920", "step": 486, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:53:03.218782", "step": 486, "epoch": 1 }, { "type": "loss", "content": 0.2652597725391388, "timestamp": "2025-09-05 08:53:03.223824", "step": 487, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:53:03.435831", "step": 487, "epoch": 1 }, { "type": "loss", "content": 0.42560693621635437, "timestamp": "2025-09-05 08:53:03.453133", "step": 488, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:53:03.677906", "step": 488, "epoch": 1 }, { "type": "loss", "content": 0.42667150497436523, "timestamp": "2025-09-05 08:53:03.681745", "step": 489, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:53:03.887673", "step": 489, "epoch": 1 }, { "type": "loss", "content": 0.21720796823501587, "timestamp": "2025-09-05 08:53:03.890582", "step": 490, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 08:53:04.088914", "step": 490, "epoch": 1 }, { "type": "loss", "content": 0.2846270203590393, "timestamp": "2025-09-05 08:53:04.090855", "step": 491, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 08:53:04.289239", "step": 491, "epoch": 1 }, { "type": "loss", "content": 0.420718789100647, "timestamp": "2025-09-05 08:53:04.306191", "step": 492, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:53:04.504965", "step": 492, "epoch": 1 }, { "type": "loss", "content": 0.4062350392341614, "timestamp": "2025-09-05 08:53:04.520504", "step": 493, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:53:04.812004", "step": 493, "epoch": 1 }, { "type": "loss", "content": 0.36269500851631165, "timestamp": "2025-09-05 08:53:04.822831", "step": 494, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:53:05.030379", "step": 494, "epoch": 1 }, { "type": "loss", "content": 0.40626439452171326, "timestamp": "2025-09-05 08:53:05.032308", "step": 495, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:53:05.239160", "step": 495, "epoch": 1 }, { "type": "loss", "content": 0.39664778113365173, "timestamp": "2025-09-05 08:53:05.257545", "step": 496, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:53:05.455355", "step": 496, "epoch": 1 }, { "type": "loss", "content": 0.4610913395881653, "timestamp": "2025-09-05 08:53:05.459367", "step": 497, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-05 08:53:05.670320", "step": 497, "epoch": 1 }, { "type": "loss", "content": 0.3041105568408966, "timestamp": "2025-09-05 08:53:05.675017", "step": 498, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:53:05.885044", "step": 498, "epoch": 1 }, { "type": "loss", "content": 0.36558741331100464, "timestamp": "2025-09-05 08:53:05.887386", "step": 499, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:53:06.094042", "step": 499, "epoch": 1 }, { "type": "loss", "content": 0.29611101746559143, "timestamp": "2025-09-05 08:53:06.108327", "step": 500, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 08:53:10.952066", "step": 500, "epoch": 1 }, { "type": "pplx", "content": 61.06419594850436, "timestamp": "2025-09-05 08:53:10.955248", "step": 500, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:53:11.116573", "step": 500, "epoch": 1 }, { "type": "loss", "content": 0.3699820637702942, "timestamp": "2025-09-05 08:53:11.120663", "step": 501, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:53:11.289895", "step": 501, "epoch": 1 }, { "type": "loss", "content": 0.3068196177482605, "timestamp": "2025-09-05 08:53:11.292064", "step": 502, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:53:11.497515", "step": 502, "epoch": 1 }, { "type": "loss", "content": 0.401764839887619, "timestamp": "2025-09-05 08:53:11.499413", "step": 503, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:53:11.694902", "step": 503, "epoch": 1 }, { "type": "loss", "content": 0.3740246295928955, "timestamp": "2025-09-05 08:53:11.710379", "step": 504, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 08:53:11.902108", "step": 504, "epoch": 1 }, { "type": "loss", "content": 0.3317362368106842, "timestamp": "2025-09-05 08:53:11.904239", "step": 505, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:53:12.110463", "step": 505, "epoch": 1 }, { "type": "loss", "content": 0.32889097929000854, "timestamp": "2025-09-05 08:53:12.112806", "step": 506, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:53:12.314073", "step": 506, "epoch": 1 }, { "type": "loss", "content": 0.30558574199676514, "timestamp": "2025-09-05 08:53:12.316155", "step": 507, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:53:12.524019", "step": 507, "epoch": 1 }, { "type": "loss", "content": 0.4227554500102997, "timestamp": "2025-09-05 08:53:12.541949", "step": 508, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:53:12.742901", "step": 508, "epoch": 1 }, { "type": "loss", "content": 0.31141963601112366, "timestamp": "2025-09-05 08:53:12.745837", "step": 509, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:53:12.951102", "step": 509, "epoch": 1 }, { "type": "loss", "content": 0.3356727659702301, "timestamp": "2025-09-05 08:53:12.953827", "step": 510, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 08:53:13.155485", "step": 510, "epoch": 1 }, { "type": "loss", "content": 0.27994629740715027, "timestamp": "2025-09-05 08:53:13.158290", "step": 511, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:53:13.365905", "step": 511, "epoch": 1 }, { "type": "loss", "content": 0.40879178047180176, "timestamp": "2025-09-05 08:53:13.382155", "step": 512, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:53:13.596253", "step": 512, "epoch": 1 }, { "type": "loss", "content": 0.42021042108535767, "timestamp": "2025-09-05 08:53:13.598287", "step": 513, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:53:13.810567", "step": 513, "epoch": 1 }, { "type": "loss", "content": 0.47443243861198425, "timestamp": "2025-09-05 08:53:13.814212", "step": 514, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:53:14.020899", "step": 514, "epoch": 1 }, { "type": "loss", "content": 0.47561585903167725, "timestamp": "2025-09-05 08:53:14.024098", "step": 515, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:53:14.276776", "step": 515, "epoch": 1 }, { "type": "loss", "content": 0.3374590277671814, "timestamp": "2025-09-05 08:53:14.291744", "step": 516, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:53:14.484079", "step": 516, "epoch": 1 }, { "type": "loss", "content": 0.3174070715904236, "timestamp": "2025-09-05 08:53:14.487904", "step": 517, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 08:53:14.688330", "step": 517, "epoch": 1 }, { "type": "loss", "content": 0.5130405426025391, "timestamp": "2025-09-05 08:53:14.705020", "step": 518, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:53:14.916635", "step": 518, "epoch": 1 }, { "type": "loss", "content": 0.44266101717948914, "timestamp": "2025-09-05 08:53:14.921476", "step": 519, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:53:15.124075", "step": 519, "epoch": 1 }, { "type": "loss", "content": 0.307713121175766, "timestamp": "2025-09-05 08:53:15.140877", "step": 520, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 08:53:20.516080", "step": 520, "epoch": 1 }, { "type": "pplx", "content": 60.16279337842755, "timestamp": "2025-09-05 08:53:20.518402", "step": 520, "epoch": 1 }, { "type": "info", "content": "Checkpoint saved at step 520", "timestamp": "2025-09-05 08:53:21.039725", "step": 520, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:53:21.209774", "step": 520, "epoch": 1 }, { "type": "loss", "content": 0.4752398729324341, "timestamp": "2025-09-05 08:53:21.211928", "step": 521, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:53:21.467367", "step": 521, "epoch": 1 }, { "type": "loss", "content": 0.3615381419658661, "timestamp": "2025-09-05 08:53:21.469896", "step": 522, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:53:21.740291", "step": 522, "epoch": 1 }, { "type": "loss", "content": 0.3906700313091278, "timestamp": "2025-09-05 08:53:21.742462", "step": 523, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:53:21.955790", "step": 523, "epoch": 1 }, { "type": "loss", "content": 0.40926316380500793, "timestamp": "2025-09-05 08:53:21.972562", "step": 524, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:53:22.163717", "step": 524, "epoch": 1 }, { "type": "loss", "content": 0.3688322603702545, "timestamp": "2025-09-05 08:53:22.167749", "step": 525, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:53:22.418586", "step": 525, "epoch": 1 }, { "type": "loss", "content": 0.30737367272377014, "timestamp": "2025-09-05 08:53:22.420423", "step": 526, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:53:22.627783", "step": 526, "epoch": 1 }, { "type": "loss", "content": 0.3638547956943512, "timestamp": "2025-09-05 08:53:22.632266", "step": 527, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:53:22.840299", "step": 527, "epoch": 1 }, { "type": "loss", "content": 0.3204697072505951, "timestamp": "2025-09-05 08:53:22.858692", "step": 528, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 08:53:23.129245", "step": 528, "epoch": 1 }, { "type": "loss", "content": 0.33960676193237305, "timestamp": "2025-09-05 08:53:23.131263", "step": 529, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:53:23.336988", "step": 529, "epoch": 1 }, { "type": "loss", "content": 0.26278024911880493, "timestamp": "2025-09-05 08:53:23.343774", "step": 530, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:53:23.545603", "step": 530, "epoch": 1 }, { "type": "loss", "content": 0.46334969997406006, "timestamp": "2025-09-05 08:53:23.547858", "step": 531, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 08:53:23.748378", "step": 531, "epoch": 1 }, { "type": "loss", "content": 0.3730030059814453, "timestamp": "2025-09-05 08:53:23.763616", "step": 532, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:53:23.955681", "step": 532, "epoch": 1 }, { "type": "loss", "content": 0.3750768303871155, "timestamp": "2025-09-05 08:53:23.958740", "step": 533, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:53:24.159708", "step": 533, "epoch": 1 }, { "type": "loss", "content": 0.4468510150909424, "timestamp": "2025-09-05 08:53:24.162295", "step": 534, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:53:24.358945", "step": 534, "epoch": 1 }, { "type": "loss", "content": 0.33164605498313904, "timestamp": "2025-09-05 08:53:24.361476", "step": 535, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:53:24.560635", "step": 535, "epoch": 1 }, { "type": "loss", "content": 0.41563618183135986, "timestamp": "2025-09-05 08:53:24.583314", "step": 536, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 08:53:24.783434", "step": 536, "epoch": 1 }, { "type": "loss", "content": 0.2689324915409088, "timestamp": "2025-09-05 08:53:24.786279", "step": 537, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 4800029206464.0 }, "timestamp": "2025-09-05 08:53:24.996809", "step": 537, "epoch": 1 }, { "type": "loss", "content": 0.4886358678340912, "timestamp": "2025-09-05 08:53:24.998662", "step": 538, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 08:53:25.202991", "step": 538, "epoch": 1 }, { "type": "loss", "content": 0.31535568833351135, "timestamp": "2025-09-05 08:53:25.205119", "step": 539, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:53:25.415088", "step": 539, "epoch": 1 }, { "type": "loss", "content": 0.30104270577430725, "timestamp": "2025-09-05 08:53:25.430097", "step": 540, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 08:53:30.312544", "step": 540, "epoch": 1 }, { "type": "pplx", "content": 59.515503396161826, "timestamp": "2025-09-05 08:53:30.316745", "step": 540, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:53:30.481458", "step": 540, "epoch": 1 }, { "type": "loss", "content": 0.38238897919654846, "timestamp": "2025-09-05 08:53:30.483585", "step": 541, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:53:30.652504", "step": 541, "epoch": 1 }, { "type": "loss", "content": 0.3997941017150879, "timestamp": "2025-09-05 08:53:30.656351", "step": 542, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:53:30.862613", "step": 542, "epoch": 1 }, { "type": "loss", "content": 0.38764917850494385, "timestamp": "2025-09-05 08:53:30.865289", "step": 543, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:53:31.115871", "step": 543, "epoch": 1 }, { "type": "loss", "content": 0.2606298327445984, "timestamp": "2025-09-05 08:53:31.132908", "step": 544, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:53:31.331423", "step": 544, "epoch": 1 }, { "type": "loss", "content": 0.4200422465801239, "timestamp": "2025-09-05 08:53:31.333568", "step": 545, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:53:31.542945", "step": 545, "epoch": 1 }, { "type": "loss", "content": 0.30847230553627014, "timestamp": "2025-09-05 08:53:31.545735", "step": 546, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:53:31.755943", "step": 546, "epoch": 1 }, { "type": "loss", "content": 0.3630848228931427, "timestamp": "2025-09-05 08:53:31.758184", "step": 547, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 4480027263872.0 }, "timestamp": "2025-09-05 08:53:31.961967", "step": 547, "epoch": 1 }, { "type": "loss", "content": 0.4338149428367615, "timestamp": "2025-09-05 08:53:31.982517", "step": 548, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:53:32.181564", "step": 548, "epoch": 1 }, { "type": "loss", "content": 0.43823084235191345, "timestamp": "2025-09-05 08:53:32.183709", "step": 549, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:53:32.393487", "step": 549, "epoch": 1 }, { "type": "loss", "content": 0.5095869898796082, "timestamp": "2025-09-05 08:53:32.395847", "step": 550, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:53:32.607294", "step": 550, "epoch": 1 }, { "type": "loss", "content": 0.4471019506454468, "timestamp": "2025-09-05 08:53:32.609524", "step": 551, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:53:32.821073", "step": 551, "epoch": 1 }, { "type": "loss", "content": 0.2290228307247162, "timestamp": "2025-09-05 08:53:32.839258", "step": 552, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:53:33.041860", "step": 552, "epoch": 1 }, { "type": "loss", "content": 0.23595690727233887, "timestamp": "2025-09-05 08:53:33.044168", "step": 553, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-05 08:53:33.253035", "step": 553, "epoch": 1 }, { "type": "loss", "content": 0.19498829543590546, "timestamp": "2025-09-05 08:53:33.255795", "step": 554, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:53:33.465968", "step": 554, "epoch": 1 }, { "type": "loss", "content": 0.30002859234809875, "timestamp": "2025-09-05 08:53:33.467878", "step": 555, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:53:33.681230", "step": 555, "epoch": 1 }, { "type": "loss", "content": 0.3351731300354004, "timestamp": "2025-09-05 08:53:33.695980", "step": 556, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:53:33.940273", "step": 556, "epoch": 1 }, { "type": "loss", "content": 0.3931390345096588, "timestamp": "2025-09-05 08:53:33.943284", "step": 557, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:53:34.150261", "step": 557, "epoch": 1 }, { "type": "loss", "content": 0.28822633624076843, "timestamp": "2025-09-05 08:53:34.153543", "step": 558, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 4480027263872.0 }, "timestamp": "2025-09-05 08:53:34.351789", "step": 558, "epoch": 1 }, { "type": "loss", "content": 0.32444193959236145, "timestamp": "2025-09-05 08:53:34.354902", "step": 559, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:53:34.561937", "step": 559, "epoch": 1 }, { "type": "loss", "content": 0.3384038209915161, "timestamp": "2025-09-05 08:53:34.621017", "step": 560, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 08:53:39.663441", "step": 560, "epoch": 1 }, { "type": "pplx", "content": 59.60952474637825, "timestamp": "2025-09-05 08:53:39.666181", "step": 560, "epoch": 1 }, { "type": "info", "content": "Checkpoint saved at step 560", "timestamp": "2025-09-05 08:53:40.136439", "step": 560, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:53:40.320945", "step": 560, "epoch": 1 }, { "type": "loss", "content": 0.2801932096481323, "timestamp": "2025-09-05 08:53:40.323253", "step": 561, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:53:40.532084", "step": 561, "epoch": 1 }, { "type": "loss", "content": 0.41452479362487793, "timestamp": "2025-09-05 08:53:40.534062", "step": 562, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:53:40.735016", "step": 562, "epoch": 1 }, { "type": "loss", "content": 0.24028798937797546, "timestamp": "2025-09-05 08:53:40.737869", "step": 563, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:53:40.937501", "step": 563, "epoch": 1 }, { "type": "loss", "content": 0.3565094769001007, "timestamp": "2025-09-05 08:53:40.952391", "step": 564, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 08:53:41.150448", "step": 564, "epoch": 1 }, { "type": "loss", "content": 0.493753045797348, "timestamp": "2025-09-05 08:53:41.152826", "step": 565, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:53:41.350962", "step": 565, "epoch": 1 }, { "type": "loss", "content": 0.3578570485115051, "timestamp": "2025-09-05 08:53:41.353796", "step": 566, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:53:41.562560", "step": 566, "epoch": 1 }, { "type": "loss", "content": 0.3159957230091095, "timestamp": "2025-09-05 08:53:41.584877", "step": 567, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:53:41.804368", "step": 567, "epoch": 1 }, { "type": "loss", "content": 0.3041984736919403, "timestamp": "2025-09-05 08:53:41.819203", "step": 568, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:53:42.007943", "step": 568, "epoch": 1 }, { "type": "loss", "content": 0.32441309094429016, "timestamp": "2025-09-05 08:53:42.011035", "step": 569, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:53:42.219961", "step": 569, "epoch": 1 }, { "type": "loss", "content": 0.2862820029258728, "timestamp": "2025-09-05 08:53:42.222290", "step": 570, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:53:42.424797", "step": 570, "epoch": 1 }, { "type": "loss", "content": 0.3482998013496399, "timestamp": "2025-09-05 08:53:42.426971", "step": 571, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:53:42.628721", "step": 571, "epoch": 1 }, { "type": "loss", "content": 0.31323230266571045, "timestamp": "2025-09-05 08:53:42.646181", "step": 572, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:53:42.835879", "step": 572, "epoch": 1 }, { "type": "loss", "content": 0.348871648311615, "timestamp": "2025-09-05 08:53:42.837879", "step": 573, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:53:43.034316", "step": 573, "epoch": 1 }, { "type": "loss", "content": 0.3106140196323395, "timestamp": "2025-09-05 08:53:43.036776", "step": 574, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:53:43.254863", "step": 574, "epoch": 1 }, { "type": "loss", "content": 0.47986507415771484, "timestamp": "2025-09-05 08:53:43.256993", "step": 575, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:53:43.455155", "step": 575, "epoch": 1 }, { "type": "loss", "content": 0.4856138229370117, "timestamp": "2025-09-05 08:53:43.469870", "step": 576, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:53:43.659035", "step": 576, "epoch": 1 }, { "type": "loss", "content": 0.23853756487369537, "timestamp": "2025-09-05 08:53:43.661541", "step": 577, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:53:43.837556", "step": 577, "epoch": 1 }, { "type": "loss", "content": 0.34495970606803894, "timestamp": "2025-09-05 08:53:43.840008", "step": 578, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:53:44.024806", "step": 578, "epoch": 1 }, { "type": "loss", "content": 0.3690643906593323, "timestamp": "2025-09-05 08:53:44.028077", "step": 579, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:53:44.212378", "step": 579, "epoch": 1 }, { "type": "loss", "content": 0.33360567688941956, "timestamp": "2025-09-05 08:53:44.229347", "step": 580, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 08:53:49.114108", "step": 580, "epoch": 1 }, { "type": "pplx", "content": 60.10085432045332, "timestamp": "2025-09-05 08:53:49.117763", "step": 580, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:53:49.263689", "step": 580, "epoch": 1 }, { "type": "loss", "content": 0.3735733926296234, "timestamp": "2025-09-05 08:53:49.265593", "step": 581, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:53:49.413395", "step": 581, "epoch": 1 }, { "type": "loss", "content": 0.4649202227592468, "timestamp": "2025-09-05 08:53:49.420816", "step": 582, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 08:53:49.651509", "step": 582, "epoch": 1 }, { "type": "loss", "content": 0.3592207431793213, "timestamp": "2025-09-05 08:53:49.653601", "step": 583, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:53:49.843228", "step": 583, "epoch": 1 }, { "type": "loss", "content": 0.45228156447410583, "timestamp": "2025-09-05 08:53:49.861343", "step": 584, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:53:50.039570", "step": 584, "epoch": 1 }, { "type": "loss", "content": 0.31664058566093445, "timestamp": "2025-09-05 08:53:50.042225", "step": 585, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:53:50.218017", "step": 585, "epoch": 1 }, { "type": "loss", "content": 0.33015117049217224, "timestamp": "2025-09-05 08:53:50.220194", "step": 586, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:53:50.400985", "step": 586, "epoch": 1 }, { "type": "loss", "content": 0.367570698261261, "timestamp": "2025-09-05 08:53:50.404104", "step": 587, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:53:50.611204", "step": 587, "epoch": 1 }, { "type": "loss", "content": 0.2882497310638428, "timestamp": "2025-09-05 08:53:50.629492", "step": 588, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:53:50.813865", "step": 588, "epoch": 1 }, { "type": "loss", "content": 0.37059807777404785, "timestamp": "2025-09-05 08:53:50.817712", "step": 589, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:53:51.023041", "step": 589, "epoch": 1 }, { "type": "loss", "content": 0.3063223361968994, "timestamp": "2025-09-05 08:53:51.025798", "step": 590, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:53:51.213430", "step": 590, "epoch": 1 }, { "type": "loss", "content": 0.45722252130508423, "timestamp": "2025-09-05 08:53:51.256253", "step": 591, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:53:51.435498", "step": 591, "epoch": 1 }, { "type": "loss", "content": 0.23291143774986267, "timestamp": "2025-09-05 08:53:51.449993", "step": 592, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:53:51.620906", "step": 592, "epoch": 1 }, { "type": "loss", "content": 0.3415684700012207, "timestamp": "2025-09-05 08:53:51.624124", "step": 593, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:53:51.808049", "step": 593, "epoch": 1 }, { "type": "loss", "content": 0.3442092537879944, "timestamp": "2025-09-05 08:53:51.810700", "step": 594, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:53:51.988789", "step": 594, "epoch": 1 }, { "type": "loss", "content": 0.27133557200431824, "timestamp": "2025-09-05 08:53:51.991133", "step": 595, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:53:52.243889", "step": 595, "epoch": 1 }, { "type": "loss", "content": 0.27585190534591675, "timestamp": "2025-09-05 08:53:52.258810", "step": 596, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:53:52.426840", "step": 596, "epoch": 1 }, { "type": "loss", "content": 0.41854625940322876, "timestamp": "2025-09-05 08:53:52.428957", "step": 597, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:53:52.606477", "step": 597, "epoch": 1 }, { "type": "loss", "content": 0.4163596034049988, "timestamp": "2025-09-05 08:53:52.609183", "step": 598, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 08:53:52.794736", "step": 598, "epoch": 1 }, { "type": "loss", "content": 0.24686819314956665, "timestamp": "2025-09-05 08:53:52.797153", "step": 599, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:53:52.993744", "step": 599, "epoch": 1 }, { "type": "loss", "content": 0.27659153938293457, "timestamp": "2025-09-05 08:53:53.008373", "step": 600, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 08:53:58.070605", "step": 600, "epoch": 1 }, { "type": "pplx", "content": 60.150732301423496, "timestamp": "2025-09-05 08:53:58.072774", "step": 600, "epoch": 1 }, { "type": "info", "content": "Checkpoint saved at step 600", "timestamp": "2025-09-05 08:53:58.524032", "step": 600, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 4480027263872.0 }, "timestamp": "2025-09-05 08:53:58.695316", "step": 600, "epoch": 1 }, { "type": "loss", "content": 0.36288002133369446, "timestamp": "2025-09-05 08:53:58.697367", "step": 601, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:53:58.904083", "step": 601, "epoch": 1 }, { "type": "loss", "content": 0.2768324017524719, "timestamp": "2025-09-05 08:53:58.906425", "step": 602, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:53:59.104371", "step": 602, "epoch": 1 }, { "type": "loss", "content": 0.3677636682987213, "timestamp": "2025-09-05 08:53:59.106677", "step": 603, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:53:59.315197", "step": 603, "epoch": 1 }, { "type": "loss", "content": 0.5175231099128723, "timestamp": "2025-09-05 08:53:59.329583", "step": 604, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-05 08:53:59.521505", "step": 604, "epoch": 1 }, { "type": "loss", "content": 0.30161038041114807, "timestamp": "2025-09-05 08:53:59.524024", "step": 605, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:53:59.720228", "step": 605, "epoch": 1 }, { "type": "loss", "content": 0.19638413190841675, "timestamp": "2025-09-05 08:53:59.723539", "step": 606, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:53:59.931213", "step": 606, "epoch": 1 }, { "type": "loss", "content": 0.403491348028183, "timestamp": "2025-09-05 08:53:59.933056", "step": 607, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:54:00.142014", "step": 607, "epoch": 1 }, { "type": "loss", "content": 0.5172940492630005, "timestamp": "2025-09-05 08:54:00.158712", "step": 608, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:54:00.360444", "step": 608, "epoch": 1 }, { "type": "loss", "content": 0.43559950590133667, "timestamp": "2025-09-05 08:54:00.363569", "step": 609, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 08:54:00.575220", "step": 609, "epoch": 1 }, { "type": "loss", "content": 0.39010724425315857, "timestamp": "2025-09-05 08:54:00.578011", "step": 610, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:54:00.785308", "step": 610, "epoch": 1 }, { "type": "loss", "content": 0.38972899317741394, "timestamp": "2025-09-05 08:54:00.788490", "step": 611, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 4480027263872.0 }, "timestamp": "2025-09-05 08:54:00.998969", "step": 611, "epoch": 1 }, { "type": "loss", "content": 0.26329943537712097, "timestamp": "2025-09-05 08:54:01.015379", "step": 612, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:54:01.213718", "step": 612, "epoch": 1 }, { "type": "loss", "content": 0.47966381907463074, "timestamp": "2025-09-05 08:54:01.216175", "step": 613, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:54:01.426283", "step": 613, "epoch": 1 }, { "type": "loss", "content": 0.3759992718696594, "timestamp": "2025-09-05 08:54:01.429304", "step": 614, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:54:01.648956", "step": 614, "epoch": 1 }, { "type": "loss", "content": 0.43966683745384216, "timestamp": "2025-09-05 08:54:01.650896", "step": 615, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:54:01.857825", "step": 615, "epoch": 1 }, { "type": "loss", "content": 0.2430787980556488, "timestamp": "2025-09-05 08:54:01.872042", "step": 616, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:54:02.064318", "step": 616, "epoch": 1 }, { "type": "loss", "content": 0.3986109793186188, "timestamp": "2025-09-05 08:54:02.066439", "step": 617, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:54:02.314801", "step": 617, "epoch": 1 }, { "type": "loss", "content": 0.34388166666030884, "timestamp": "2025-09-05 08:54:02.347074", "step": 618, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:54:02.556618", "step": 618, "epoch": 1 }, { "type": "loss", "content": 0.23649443686008453, "timestamp": "2025-09-05 08:54:02.559154", "step": 619, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:54:02.759787", "step": 619, "epoch": 1 }, { "type": "loss", "content": 0.3429713249206543, "timestamp": "2025-09-05 08:54:02.776171", "step": 620, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 08:54:07.720311", "step": 620, "epoch": 1 }, { "type": "pplx", "content": 60.0059737944772, "timestamp": "2025-09-05 08:54:07.722616", "step": 620, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:54:07.886060", "step": 620, "epoch": 1 }, { "type": "loss", "content": 0.38172709941864014, "timestamp": "2025-09-05 08:54:07.888163", "step": 621, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:54:08.085941", "step": 621, "epoch": 1 }, { "type": "loss", "content": 0.29247331619262695, "timestamp": "2025-09-05 08:54:08.089072", "step": 622, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:54:08.289376", "step": 622, "epoch": 1 }, { "type": "loss", "content": 0.30984073877334595, "timestamp": "2025-09-05 08:54:08.291291", "step": 623, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 4480027263872.0 }, "timestamp": "2025-09-05 08:54:08.489187", "step": 623, "epoch": 1 }, { "type": "loss", "content": 0.4325138032436371, "timestamp": "2025-09-05 08:54:08.504002", "step": 624, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:54:08.693990", "step": 624, "epoch": 1 }, { "type": "loss", "content": 0.38128820061683655, "timestamp": "2025-09-05 08:54:08.697641", "step": 625, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 4480027263872.0 }, "timestamp": "2025-09-05 08:54:08.897279", "step": 625, "epoch": 1 }, { "type": "loss", "content": 0.34949132800102234, "timestamp": "2025-09-05 08:54:08.899825", "step": 626, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:54:09.107520", "step": 626, "epoch": 1 }, { "type": "loss", "content": 0.21804000437259674, "timestamp": "2025-09-05 08:54:09.109945", "step": 627, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:54:09.318482", "step": 627, "epoch": 1 }, { "type": "loss", "content": 0.31279170513153076, "timestamp": "2025-09-05 08:54:09.334969", "step": 628, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:54:09.524559", "step": 628, "epoch": 1 }, { "type": "loss", "content": 0.4154609441757202, "timestamp": "2025-09-05 08:54:09.527910", "step": 629, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:54:09.724828", "step": 629, "epoch": 1 }, { "type": "loss", "content": 0.36364659667015076, "timestamp": "2025-09-05 08:54:09.727232", "step": 630, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:54:09.923511", "step": 630, "epoch": 1 }, { "type": "loss", "content": 0.566509485244751, "timestamp": "2025-09-05 08:54:09.925616", "step": 631, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:54:10.122125", "step": 631, "epoch": 1 }, { "type": "loss", "content": 0.2817482054233551, "timestamp": "2025-09-05 08:54:10.136938", "step": 632, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:54:10.328401", "step": 632, "epoch": 1 }, { "type": "loss", "content": 0.4373268485069275, "timestamp": "2025-09-05 08:54:10.330653", "step": 633, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 08:54:10.537850", "step": 633, "epoch": 1 }, { "type": "loss", "content": 0.2138715237379074, "timestamp": "2025-09-05 08:54:10.540845", "step": 634, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 08:54:10.738631", "step": 634, "epoch": 1 }, { "type": "loss", "content": 0.3299078643321991, "timestamp": "2025-09-05 08:54:10.740930", "step": 635, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 08:54:10.937830", "step": 635, "epoch": 1 }, { "type": "loss", "content": 0.38988572359085083, "timestamp": "2025-09-05 08:54:10.952014", "step": 636, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 08:54:11.142159", "step": 636, "epoch": 1 }, { "type": "loss", "content": 0.5013632774353027, "timestamp": "2025-09-05 08:54:11.146725", "step": 637, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:54:11.353235", "step": 637, "epoch": 1 }, { "type": "loss", "content": 0.25191569328308105, "timestamp": "2025-09-05 08:54:11.355726", "step": 638, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:54:11.552894", "step": 638, "epoch": 1 }, { "type": "loss", "content": 0.3327215313911438, "timestamp": "2025-09-05 08:54:11.556011", "step": 639, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:54:11.752920", "step": 639, "epoch": 1 }, { "type": "loss", "content": 0.3358171880245209, "timestamp": "2025-09-05 08:54:11.767103", "step": 640, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 08:54:16.930895", "step": 640, "epoch": 1 }, { "type": "pplx", "content": 60.013876298957854, "timestamp": "2025-09-05 08:54:16.932912", "step": 640, "epoch": 1 }, { "type": "info", "content": "Checkpoint saved at step 640", "timestamp": "2025-09-05 08:54:17.406898", "step": 640, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 08:54:17.592032", "step": 640, "epoch": 1 }, { "type": "loss", "content": 0.49637261033058167, "timestamp": "2025-09-05 08:54:17.593921", "step": 641, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:54:17.799004", "step": 641, "epoch": 1 }, { "type": "loss", "content": 0.33820703625679016, "timestamp": "2025-09-05 08:54:17.801216", "step": 642, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:54:18.013549", "step": 642, "epoch": 1 }, { "type": "loss", "content": 0.3308141529560089, "timestamp": "2025-09-05 08:54:18.016186", "step": 643, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:54:18.258292", "step": 643, "epoch": 1 }, { "type": "loss", "content": 0.40199655294418335, "timestamp": "2025-09-05 08:54:18.273111", "step": 644, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:54:18.462659", "step": 644, "epoch": 1 }, { "type": "loss", "content": 0.30657947063446045, "timestamp": "2025-09-05 08:54:18.464765", "step": 645, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:54:18.665532", "step": 645, "epoch": 1 }, { "type": "loss", "content": 0.3859306573867798, "timestamp": "2025-09-05 08:54:18.667913", "step": 646, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:54:18.876768", "step": 646, "epoch": 1 }, { "type": "loss", "content": 0.3470034599304199, "timestamp": "2025-09-05 08:54:18.879140", "step": 647, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:54:19.110177", "step": 647, "epoch": 1 }, { "type": "loss", "content": 0.3591715395450592, "timestamp": "2025-09-05 08:54:19.125306", "step": 648, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:54:19.314904", "step": 648, "epoch": 1 }, { "type": "loss", "content": 0.29138100147247314, "timestamp": "2025-09-05 08:54:19.317592", "step": 649, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:54:19.526396", "step": 649, "epoch": 1 }, { "type": "loss", "content": 0.48001521825790405, "timestamp": "2025-09-05 08:54:19.529551", "step": 650, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:54:19.734777", "step": 650, "epoch": 1 }, { "type": "loss", "content": 0.30436667799949646, "timestamp": "2025-09-05 08:54:19.738102", "step": 651, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:54:19.944375", "step": 651, "epoch": 1 }, { "type": "loss", "content": 0.41145777702331543, "timestamp": "2025-09-05 08:54:19.960171", "step": 652, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 4480027263872.0 }, "timestamp": "2025-09-05 08:54:20.150221", "step": 652, "epoch": 1 }, { "type": "loss", "content": 0.4586491584777832, "timestamp": "2025-09-05 08:54:20.152746", "step": 653, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:54:20.359771", "step": 653, "epoch": 1 }, { "type": "loss", "content": 0.2969052791595459, "timestamp": "2025-09-05 08:54:20.362605", "step": 654, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:54:20.572199", "step": 654, "epoch": 1 }, { "type": "loss", "content": 0.4161323010921478, "timestamp": "2025-09-05 08:54:20.575376", "step": 655, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:54:20.782477", "step": 655, "epoch": 1 }, { "type": "loss", "content": 0.36784666776657104, "timestamp": "2025-09-05 08:54:20.799260", "step": 656, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:54:21.003719", "step": 656, "epoch": 1 }, { "type": "loss", "content": 0.30674105882644653, "timestamp": "2025-09-05 08:54:21.005922", "step": 657, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:54:21.204382", "step": 657, "epoch": 1 }, { "type": "loss", "content": 0.3142521381378174, "timestamp": "2025-09-05 08:54:21.207457", "step": 658, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 08:54:21.421749", "step": 658, "epoch": 1 }, { "type": "loss", "content": 0.4763060212135315, "timestamp": "2025-09-05 08:54:21.425096", "step": 659, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:54:21.631974", "step": 659, "epoch": 1 }, { "type": "loss", "content": 0.2550380825996399, "timestamp": "2025-09-05 08:54:21.646742", "step": 660, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 08:54:26.783208", "step": 660, "epoch": 1 }, { "type": "pplx", "content": 59.977893482287755, "timestamp": "2025-09-05 08:54:26.785662", "step": 660, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:54:26.946480", "step": 660, "epoch": 1 }, { "type": "loss", "content": 0.3140474855899811, "timestamp": "2025-09-05 08:54:26.949023", "step": 661, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:54:27.117596", "step": 661, "epoch": 1 }, { "type": "loss", "content": 0.5123893022537231, "timestamp": "2025-09-05 08:54:27.120661", "step": 662, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:54:27.326935", "step": 662, "epoch": 1 }, { "type": "loss", "content": 0.3251909017562866, "timestamp": "2025-09-05 08:54:27.328976", "step": 663, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:54:27.525600", "step": 663, "epoch": 1 }, { "type": "loss", "content": 0.34844574332237244, "timestamp": "2025-09-05 08:54:27.540739", "step": 664, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-05 08:54:27.740184", "step": 664, "epoch": 1 }, { "type": "loss", "content": 0.4417649209499359, "timestamp": "2025-09-05 08:54:27.823505", "step": 665, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:54:28.118452", "step": 665, "epoch": 1 }, { "type": "loss", "content": 0.2881294786930084, "timestamp": "2025-09-05 08:54:28.120838", "step": 666, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:54:28.319720", "step": 666, "epoch": 1 }, { "type": "loss", "content": 0.3033196032047272, "timestamp": "2025-09-05 08:54:28.322799", "step": 667, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:54:28.532384", "step": 667, "epoch": 1 }, { "type": "loss", "content": 0.35287392139434814, "timestamp": "2025-09-05 08:54:28.547149", "step": 668, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:54:28.739925", "step": 668, "epoch": 1 }, { "type": "loss", "content": 0.3460994362831116, "timestamp": "2025-09-05 08:54:28.816991", "step": 669, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:54:29.071266", "step": 669, "epoch": 1 }, { "type": "loss", "content": 0.2680738866329193, "timestamp": "2025-09-05 08:54:29.073805", "step": 670, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:54:29.279231", "step": 670, "epoch": 1 }, { "type": "loss", "content": 0.3204239010810852, "timestamp": "2025-09-05 08:54:29.286163", "step": 671, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:54:29.489604", "step": 671, "epoch": 1 }, { "type": "loss", "content": 0.2820996344089508, "timestamp": "2025-09-05 08:54:29.508998", "step": 672, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:54:29.798942", "step": 672, "epoch": 1 }, { "type": "loss", "content": 0.3304928243160248, "timestamp": "2025-09-05 08:54:29.801447", "step": 673, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 08:54:30.010222", "step": 673, "epoch": 1 }, { "type": "loss", "content": 0.3490171730518341, "timestamp": "2025-09-05 08:54:30.013730", "step": 674, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:54:30.224578", "step": 674, "epoch": 1 }, { "type": "loss", "content": 0.39344292879104614, "timestamp": "2025-09-05 08:54:30.227117", "step": 675, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:54:30.423153", "step": 675, "epoch": 1 }, { "type": "loss", "content": 0.2898615300655365, "timestamp": "2025-09-05 08:54:30.440851", "step": 676, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 08:54:30.692418", "step": 676, "epoch": 1 }, { "type": "loss", "content": 0.4104023575782776, "timestamp": "2025-09-05 08:54:30.696366", "step": 677, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:54:30.902770", "step": 677, "epoch": 1 }, { "type": "loss", "content": 0.30414190888404846, "timestamp": "2025-09-05 08:54:30.905070", "step": 678, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:54:31.103059", "step": 678, "epoch": 1 }, { "type": "loss", "content": 0.4684240520000458, "timestamp": "2025-09-05 08:54:31.106036", "step": 679, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-05 08:54:31.312497", "step": 679, "epoch": 1 }, { "type": "loss", "content": 0.34773412346839905, "timestamp": "2025-09-05 08:54:31.328594", "step": 680, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 08:54:36.267599", "step": 680, "epoch": 1 }, { "type": "pplx", "content": 60.13139868121867, "timestamp": "2025-09-05 08:54:36.271850", "step": 680, "epoch": 1 }, { "type": "info", "content": "Checkpoint saved at step 680", "timestamp": "2025-09-05 08:54:36.750503", "step": 680, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 08:54:36.998015", "step": 680, "epoch": 1 }, { "type": "loss", "content": 0.3354809880256653, "timestamp": "2025-09-05 08:54:37.000382", "step": 681, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:54:37.200209", "step": 681, "epoch": 1 }, { "type": "loss", "content": 0.45760607719421387, "timestamp": "2025-09-05 08:54:37.203851", "step": 682, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:54:37.402177", "step": 682, "epoch": 1 }, { "type": "loss", "content": 0.19389431178569794, "timestamp": "2025-09-05 08:54:37.404069", "step": 683, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:54:37.603549", "step": 683, "epoch": 1 }, { "type": "loss", "content": 0.45813360810279846, "timestamp": "2025-09-05 08:54:37.621567", "step": 684, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 4800029206464.0 }, "timestamp": "2025-09-05 08:54:37.823186", "step": 684, "epoch": 1 }, { "type": "loss", "content": 0.4114960730075836, "timestamp": "2025-09-05 08:54:37.825184", "step": 685, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:54:38.029268", "step": 685, "epoch": 1 }, { "type": "loss", "content": 0.320042222738266, "timestamp": "2025-09-05 08:54:38.032156", "step": 686, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-05 08:54:38.228650", "step": 686, "epoch": 1 }, { "type": "loss", "content": 0.3017887473106384, "timestamp": "2025-09-05 08:54:38.230931", "step": 687, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:54:38.438526", "step": 687, "epoch": 1 }, { "type": "loss", "content": 0.19925743341445923, "timestamp": "2025-09-05 08:54:38.455228", "step": 688, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:54:38.655031", "step": 688, "epoch": 1 }, { "type": "loss", "content": 0.3546088635921478, "timestamp": "2025-09-05 08:54:38.656855", "step": 689, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:54:38.855417", "step": 689, "epoch": 1 }, { "type": "loss", "content": 0.2611202001571655, "timestamp": "2025-09-05 08:54:38.857665", "step": 690, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:54:39.064991", "step": 690, "epoch": 1 }, { "type": "loss", "content": 0.4317798912525177, "timestamp": "2025-09-05 08:54:39.067201", "step": 691, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:54:39.262353", "step": 691, "epoch": 1 }, { "type": "loss", "content": 0.3688901662826538, "timestamp": "2025-09-05 08:54:39.279461", "step": 692, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:54:39.475335", "step": 692, "epoch": 1 }, { "type": "loss", "content": 0.288263201713562, "timestamp": "2025-09-05 08:54:39.477507", "step": 693, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 4800029206464.0 }, "timestamp": "2025-09-05 08:54:39.685377", "step": 693, "epoch": 1 }, { "type": "loss", "content": 0.38892319798469543, "timestamp": "2025-09-05 08:54:39.687607", "step": 694, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:54:39.894436", "step": 694, "epoch": 1 }, { "type": "loss", "content": 0.334379643201828, "timestamp": "2025-09-05 08:54:39.897342", "step": 695, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:54:40.100425", "step": 695, "epoch": 1 }, { "type": "loss", "content": 0.37965139746665955, "timestamp": "2025-09-05 08:54:40.116980", "step": 696, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:54:40.314915", "step": 696, "epoch": 1 }, { "type": "loss", "content": 0.380183607339859, "timestamp": "2025-09-05 08:54:40.317039", "step": 697, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:54:40.516922", "step": 697, "epoch": 1 }, { "type": "loss", "content": 0.37185657024383545, "timestamp": "2025-09-05 08:54:40.519343", "step": 698, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:54:40.717729", "step": 698, "epoch": 1 }, { "type": "loss", "content": 0.3406052589416504, "timestamp": "2025-09-05 08:54:40.720247", "step": 699, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 08:54:40.918093", "step": 699, "epoch": 1 }, { "type": "loss", "content": 0.29893720149993896, "timestamp": "2025-09-05 08:54:40.933084", "step": 700, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 08:54:45.631525", "step": 700, "epoch": 1 }, { "type": "pplx", "content": 60.328020476201374, "timestamp": "2025-09-05 08:54:45.634419", "step": 700, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:54:45.796220", "step": 700, "epoch": 1 }, { "type": "loss", "content": 0.2962213158607483, "timestamp": "2025-09-05 08:54:45.798188", "step": 701, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:54:46.001282", "step": 701, "epoch": 1 }, { "type": "loss", "content": 0.36525845527648926, "timestamp": "2025-09-05 08:54:46.003188", "step": 702, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:54:46.201526", "step": 702, "epoch": 1 }, { "type": "loss", "content": 0.46742987632751465, "timestamp": "2025-09-05 08:54:46.203916", "step": 703, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:54:46.399294", "step": 703, "epoch": 1 }, { "type": "loss", "content": 0.26656708121299744, "timestamp": "2025-09-05 08:54:46.416569", "step": 704, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:54:46.614553", "step": 704, "epoch": 1 }, { "type": "loss", "content": 0.3398812711238861, "timestamp": "2025-09-05 08:54:46.616644", "step": 705, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:54:46.811613", "step": 705, "epoch": 1 }, { "type": "loss", "content": 0.31698310375213623, "timestamp": "2025-09-05 08:54:46.813745", "step": 706, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:54:47.010716", "step": 706, "epoch": 1 }, { "type": "loss", "content": 0.3885927200317383, "timestamp": "2025-09-05 08:54:47.012461", "step": 707, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:54:47.216767", "step": 707, "epoch": 1 }, { "type": "loss", "content": 0.26296836137771606, "timestamp": "2025-09-05 08:54:47.230773", "step": 708, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:54:47.427633", "step": 708, "epoch": 1 }, { "type": "loss", "content": 0.41065317392349243, "timestamp": "2025-09-05 08:54:47.429933", "step": 709, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:54:47.626215", "step": 709, "epoch": 1 }, { "type": "loss", "content": 0.3585359752178192, "timestamp": "2025-09-05 08:54:47.628728", "step": 710, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 4480027263872.0 }, "timestamp": "2025-09-05 08:54:47.824583", "step": 710, "epoch": 1 }, { "type": "loss", "content": 0.40104833245277405, "timestamp": "2025-09-05 08:54:47.828309", "step": 711, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:54:48.022731", "step": 711, "epoch": 1 }, { "type": "loss", "content": 0.4005032777786255, "timestamp": "2025-09-05 08:54:48.036802", "step": 712, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:54:48.224089", "step": 712, "epoch": 1 }, { "type": "loss", "content": 0.3810969591140747, "timestamp": "2025-09-05 08:54:48.226238", "step": 713, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:54:48.421427", "step": 713, "epoch": 1 }, { "type": "loss", "content": 0.4647989571094513, "timestamp": "2025-09-05 08:54:48.423241", "step": 714, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:54:48.627565", "step": 714, "epoch": 1 }, { "type": "loss", "content": 0.26305657625198364, "timestamp": "2025-09-05 08:54:48.629519", "step": 715, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:54:48.826866", "step": 715, "epoch": 1 }, { "type": "loss", "content": 0.3350170850753784, "timestamp": "2025-09-05 08:54:48.841368", "step": 716, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:54:49.030348", "step": 716, "epoch": 1 }, { "type": "loss", "content": 0.2926214337348938, "timestamp": "2025-09-05 08:54:49.032122", "step": 717, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 08:54:49.239250", "step": 717, "epoch": 1 }, { "type": "loss", "content": 0.4400876760482788, "timestamp": "2025-09-05 08:54:49.241213", "step": 718, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:54:49.448651", "step": 718, "epoch": 1 }, { "type": "loss", "content": 0.2703559994697571, "timestamp": "2025-09-05 08:54:49.450304", "step": 719, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 08:54:49.657250", "step": 719, "epoch": 1 }, { "type": "loss", "content": 0.1777232140302658, "timestamp": "2025-09-05 08:54:49.673678", "step": 720, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 08:54:54.347893", "step": 720, "epoch": 1 }, { "type": "pplx", "content": 60.46760355156991, "timestamp": "2025-09-05 08:54:54.349759", "step": 720, "epoch": 1 }, { "type": "info", "content": "Checkpoint saved at step 720", "timestamp": "2025-09-05 08:54:54.823525", "step": 720, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:54:55.017249", "step": 720, "epoch": 1 }, { "type": "loss", "content": 0.3396753668785095, "timestamp": "2025-09-05 08:54:55.019130", "step": 721, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 08:54:55.227345", "step": 721, "epoch": 1 }, { "type": "loss", "content": 0.45180007815361023, "timestamp": "2025-09-05 08:54:55.228947", "step": 722, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:54:55.424212", "step": 722, "epoch": 1 }, { "type": "loss", "content": 0.43839141726493835, "timestamp": "2025-09-05 08:54:55.426285", "step": 723, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:54:55.623172", "step": 723, "epoch": 1 }, { "type": "loss", "content": 0.38653379678726196, "timestamp": "2025-09-05 08:54:55.640180", "step": 724, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:54:55.831590", "step": 724, "epoch": 1 }, { "type": "loss", "content": 0.302320271730423, "timestamp": "2025-09-05 08:54:55.833560", "step": 725, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:54:56.038980", "step": 725, "epoch": 1 }, { "type": "loss", "content": 0.3913811147212982, "timestamp": "2025-09-05 08:54:56.040877", "step": 726, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:54:56.246521", "step": 726, "epoch": 1 }, { "type": "loss", "content": 0.2894538640975952, "timestamp": "2025-09-05 08:54:56.248271", "step": 727, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:54:56.452672", "step": 727, "epoch": 1 }, { "type": "loss", "content": 0.29082658886909485, "timestamp": "2025-09-05 08:54:56.469163", "step": 728, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:54:56.664440", "step": 728, "epoch": 1 }, { "type": "loss", "content": 0.4690100848674774, "timestamp": "2025-09-05 08:54:56.667290", "step": 729, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 08:54:56.863708", "step": 729, "epoch": 1 }, { "type": "loss", "content": 0.4146786332130432, "timestamp": "2025-09-05 08:54:56.866143", "step": 730, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:54:57.061283", "step": 730, "epoch": 1 }, { "type": "loss", "content": 0.4398282766342163, "timestamp": "2025-09-05 08:54:57.063210", "step": 731, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:54:57.259524", "step": 731, "epoch": 1 }, { "type": "loss", "content": 0.3548581600189209, "timestamp": "2025-09-05 08:54:57.273889", "step": 732, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:54:57.459570", "step": 732, "epoch": 1 }, { "type": "loss", "content": 0.2405700534582138, "timestamp": "2025-09-05 08:54:57.461338", "step": 733, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 08:54:57.657548", "step": 733, "epoch": 1 }, { "type": "loss", "content": 0.40556153655052185, "timestamp": "2025-09-05 08:54:57.659320", "step": 734, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-05 08:54:57.823137", "step": 734, "epoch": 1 }, { "type": "loss", "content": 0.3558599650859833, "timestamp": "2025-09-05 08:54:57.825123", "step": 735, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:54:58.028019", "step": 735, "epoch": 1 }, { "type": "loss", "content": 0.3774275779724121, "timestamp": "2025-09-05 08:54:58.042770", "step": 736, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 08:54:58.242072", "step": 736, "epoch": 1 }, { "type": "loss", "content": 0.4390884339809418, "timestamp": "2025-09-05 08:54:58.245108", "step": 737, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:54:58.441939", "step": 737, "epoch": 1 }, { "type": "loss", "content": 0.27866774797439575, "timestamp": "2025-09-05 08:54:58.451705", "step": 738, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:54:58.648957", "step": 738, "epoch": 1 }, { "type": "loss", "content": 0.2765721082687378, "timestamp": "2025-09-05 08:54:58.653039", "step": 739, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:54:58.848295", "step": 739, "epoch": 1 }, { "type": "loss", "content": 0.29264095425605774, "timestamp": "2025-09-05 08:54:58.862799", "step": 740, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 08:55:03.526058", "step": 740, "epoch": 1 }, { "type": "pplx", "content": 60.057011510032595, "timestamp": "2025-09-05 08:55:03.528500", "step": 740, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:55:03.689395", "step": 740, "epoch": 1 }, { "type": "loss", "content": 0.24096165597438812, "timestamp": "2025-09-05 08:55:03.691725", "step": 741, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:55:03.861152", "step": 741, "epoch": 1 }, { "type": "loss", "content": 0.276774525642395, "timestamp": "2025-09-05 08:55:03.863850", "step": 742, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:55:04.067519", "step": 742, "epoch": 1 }, { "type": "loss", "content": 0.4308259189128876, "timestamp": "2025-09-05 08:55:04.069692", "step": 743, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 08:55:04.275651", "step": 743, "epoch": 1 }, { "type": "loss", "content": 0.2998303472995758, "timestamp": "2025-09-05 08:55:04.291124", "step": 744, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 4480027263872.0 }, "timestamp": "2025-09-05 08:55:04.479757", "step": 744, "epoch": 1 }, { "type": "loss", "content": 0.3581094443798065, "timestamp": "2025-09-05 08:55:04.481424", "step": 745, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:55:04.686029", "step": 745, "epoch": 1 }, { "type": "loss", "content": 0.44069504737854004, "timestamp": "2025-09-05 08:55:04.687941", "step": 746, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:55:04.892816", "step": 746, "epoch": 1 }, { "type": "loss", "content": 0.2966330647468567, "timestamp": "2025-09-05 08:55:04.894551", "step": 747, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:55:05.091527", "step": 747, "epoch": 1 }, { "type": "loss", "content": 0.4456374943256378, "timestamp": "2025-09-05 08:55:05.105860", "step": 748, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:55:05.300254", "step": 748, "epoch": 1 }, { "type": "loss", "content": 0.38735565543174744, "timestamp": "2025-09-05 08:55:05.301897", "step": 749, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:55:05.496733", "step": 749, "epoch": 1 }, { "type": "loss", "content": 0.3685053884983063, "timestamp": "2025-09-05 08:55:05.498685", "step": 750, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:55:05.705472", "step": 750, "epoch": 1 }, { "type": "loss", "content": 0.3653118908405304, "timestamp": "2025-09-05 08:55:05.707251", "step": 751, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:55:05.911657", "step": 751, "epoch": 1 }, { "type": "loss", "content": 0.46427515149116516, "timestamp": "2025-09-05 08:55:05.925953", "step": 752, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 08:55:06.115561", "step": 752, "epoch": 1 }, { "type": "loss", "content": 0.33055830001831055, "timestamp": "2025-09-05 08:55:06.117285", "step": 753, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:55:06.321142", "step": 753, "epoch": 1 }, { "type": "loss", "content": 0.3238837718963623, "timestamp": "2025-09-05 08:55:06.322865", "step": 754, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:55:06.518158", "step": 754, "epoch": 1 }, { "type": "loss", "content": 0.33274197578430176, "timestamp": "2025-09-05 08:55:06.519817", "step": 755, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:55:06.716279", "step": 755, "epoch": 1 }, { "type": "loss", "content": 0.6049551367759705, "timestamp": "2025-09-05 08:55:06.732935", "step": 756, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 08:55:06.929062", "step": 756, "epoch": 1 }, { "type": "loss", "content": 0.4643118977546692, "timestamp": "2025-09-05 08:55:06.930811", "step": 757, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:55:07.096596", "step": 757, "epoch": 1 }, { "type": "loss", "content": 0.37005674839019775, "timestamp": "2025-09-05 08:55:07.098944", "step": 758, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:55:07.306042", "step": 758, "epoch": 1 }, { "type": "loss", "content": 0.270403116941452, "timestamp": "2025-09-05 08:55:07.307925", "step": 759, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:55:07.512213", "step": 759, "epoch": 1 }, { "type": "loss", "content": 0.3156965374946594, "timestamp": "2025-09-05 08:55:07.521310", "step": 760, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 08:55:12.156535", "step": 760, "epoch": 1 }, { "type": "pplx", "content": 59.66704724636567, "timestamp": "2025-09-05 08:55:12.158506", "step": 760, "epoch": 1 }, { "type": "info", "content": "Checkpoint saved at step 760", "timestamp": "2025-09-05 08:55:12.626072", "step": 760, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:55:12.793769", "step": 760, "epoch": 1 }, { "type": "loss", "content": 0.24948005378246307, "timestamp": "2025-09-05 08:55:12.795644", "step": 761, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:55:12.995458", "step": 761, "epoch": 1 }, { "type": "loss", "content": 0.33802419900894165, "timestamp": "2025-09-05 08:55:12.997015", "step": 762, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:55:13.200557", "step": 762, "epoch": 1 }, { "type": "loss", "content": 0.2108272910118103, "timestamp": "2025-09-05 08:55:13.202057", "step": 763, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:55:13.397743", "step": 763, "epoch": 1 }, { "type": "loss", "content": 0.31162819266319275, "timestamp": "2025-09-05 08:55:13.411933", "step": 764, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:55:13.598710", "step": 764, "epoch": 1 }, { "type": "loss", "content": 0.44521042704582214, "timestamp": "2025-09-05 08:55:13.600271", "step": 765, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:55:13.795524", "step": 765, "epoch": 1 }, { "type": "loss", "content": 0.3431508243083954, "timestamp": "2025-09-05 08:55:13.797365", "step": 766, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:55:14.002948", "step": 766, "epoch": 1 }, { "type": "loss", "content": 0.31309953331947327, "timestamp": "2025-09-05 08:55:14.004890", "step": 767, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:55:14.199262", "step": 767, "epoch": 1 }, { "type": "loss", "content": 0.2508161664009094, "timestamp": "2025-09-05 08:55:14.216268", "step": 768, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:55:14.416044", "step": 768, "epoch": 1 }, { "type": "loss", "content": 0.35422399640083313, "timestamp": "2025-09-05 08:55:14.417865", "step": 769, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:55:14.625474", "step": 769, "epoch": 1 }, { "type": "loss", "content": 0.39631184935569763, "timestamp": "2025-09-05 08:55:14.627596", "step": 770, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 4480027263872.0 }, "timestamp": "2025-09-05 08:55:14.834274", "step": 770, "epoch": 1 }, { "type": "loss", "content": 0.35908469557762146, "timestamp": "2025-09-05 08:55:14.836925", "step": 771, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:55:15.033942", "step": 771, "epoch": 1 }, { "type": "loss", "content": 0.44913047552108765, "timestamp": "2025-09-05 08:55:15.050281", "step": 772, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 08:55:15.248779", "step": 772, "epoch": 1 }, { "type": "loss", "content": 0.35234954953193665, "timestamp": "2025-09-05 08:55:15.250645", "step": 773, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:55:15.455657", "step": 773, "epoch": 1 }, { "type": "loss", "content": 0.26836010813713074, "timestamp": "2025-09-05 08:55:15.457801", "step": 774, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:55:15.655351", "step": 774, "epoch": 1 }, { "type": "loss", "content": 0.4194760024547577, "timestamp": "2025-09-05 08:55:15.657289", "step": 775, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:55:15.857106", "step": 775, "epoch": 1 }, { "type": "loss", "content": 0.4029027223587036, "timestamp": "2025-09-05 08:55:15.871310", "step": 776, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:55:16.067439", "step": 776, "epoch": 1 }, { "type": "loss", "content": 0.3827188014984131, "timestamp": "2025-09-05 08:55:16.069132", "step": 777, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:55:16.274296", "step": 777, "epoch": 1 }, { "type": "loss", "content": 0.3042537569999695, "timestamp": "2025-09-05 08:55:16.276538", "step": 778, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:55:16.485146", "step": 778, "epoch": 1 }, { "type": "loss", "content": 0.3845559358596802, "timestamp": "2025-09-05 08:55:16.486916", "step": 779, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:55:16.682919", "step": 779, "epoch": 1 }, { "type": "loss", "content": 0.3793331980705261, "timestamp": "2025-09-05 08:55:16.696900", "step": 780, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 08:55:21.332508", "step": 780, "epoch": 1 }, { "type": "pplx", "content": 59.53960384330279, "timestamp": "2025-09-05 08:55:21.334280", "step": 780, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:55:21.495075", "step": 780, "epoch": 1 }, { "type": "loss", "content": 0.42627277970314026, "timestamp": "2025-09-05 08:55:21.497499", "step": 781, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:55:21.665091", "step": 781, "epoch": 1 }, { "type": "loss", "content": 0.2579745650291443, "timestamp": "2025-09-05 08:55:21.666790", "step": 782, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:55:21.872172", "step": 782, "epoch": 1 }, { "type": "loss", "content": 0.2808438539505005, "timestamp": "2025-09-05 08:55:21.874532", "step": 783, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:55:22.070858", "step": 783, "epoch": 1 }, { "type": "loss", "content": 0.2570682764053345, "timestamp": "2025-09-05 08:55:22.080329", "step": 784, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:55:22.242021", "step": 784, "epoch": 1 }, { "type": "loss", "content": 0.39906033873558044, "timestamp": "2025-09-05 08:55:22.243659", "step": 785, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:55:22.450233", "step": 785, "epoch": 1 }, { "type": "loss", "content": 0.288041889667511, "timestamp": "2025-09-05 08:55:22.452889", "step": 786, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:55:22.621107", "step": 786, "epoch": 1 }, { "type": "loss", "content": 0.3034781217575073, "timestamp": "2025-09-05 08:55:22.623097", "step": 787, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:55:22.817939", "step": 787, "epoch": 1 }, { "type": "loss", "content": 0.3924509584903717, "timestamp": "2025-09-05 08:55:22.827178", "step": 788, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 08:55:22.991967", "step": 788, "epoch": 1 }, { "type": "loss", "content": 0.3101724684238434, "timestamp": "2025-09-05 08:55:22.993571", "step": 789, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:55:23.198900", "step": 789, "epoch": 1 }, { "type": "loss", "content": 0.35364800691604614, "timestamp": "2025-09-05 08:55:23.200466", "step": 790, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:55:23.395760", "step": 790, "epoch": 1 }, { "type": "loss", "content": 0.29069453477859497, "timestamp": "2025-09-05 08:55:23.398959", "step": 791, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:55:23.595445", "step": 791, "epoch": 1 }, { "type": "loss", "content": 0.2604013979434967, "timestamp": "2025-09-05 08:55:23.611776", "step": 792, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:55:23.809541", "step": 792, "epoch": 1 }, { "type": "loss", "content": 0.3716432452201843, "timestamp": "2025-09-05 08:55:23.811288", "step": 793, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:55:24.006648", "step": 793, "epoch": 1 }, { "type": "loss", "content": 0.2955947518348694, "timestamp": "2025-09-05 08:55:24.008408", "step": 794, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:55:24.204286", "step": 794, "epoch": 1 }, { "type": "loss", "content": 0.4398670792579651, "timestamp": "2025-09-05 08:55:24.206033", "step": 795, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:55:24.400419", "step": 795, "epoch": 1 }, { "type": "loss", "content": 0.39011335372924805, "timestamp": "2025-09-05 08:55:24.414895", "step": 796, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 4480027263872.0 }, "timestamp": "2025-09-05 08:55:24.610922", "step": 796, "epoch": 1 }, { "type": "loss", "content": 0.4034815728664398, "timestamp": "2025-09-05 08:55:24.614525", "step": 797, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:55:24.815788", "step": 797, "epoch": 1 }, { "type": "loss", "content": 0.33042198419570923, "timestamp": "2025-09-05 08:55:24.817431", "step": 798, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:55:25.022565", "step": 798, "epoch": 1 }, { "type": "loss", "content": 0.35239022970199585, "timestamp": "2025-09-05 08:55:25.024113", "step": 799, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:55:25.229432", "step": 799, "epoch": 1 }, { "type": "loss", "content": 0.38791370391845703, "timestamp": "2025-09-05 08:55:25.243544", "step": 800, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 08:55:29.889311", "step": 800, "epoch": 1 }, { "type": "pplx", "content": 58.95891625814123, "timestamp": "2025-09-05 08:55:29.891231", "step": 800, "epoch": 1 }, { "type": "info", "content": "Checkpoint saved at step 800", "timestamp": "2025-09-05 08:55:30.365184", "step": 800, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 4480027263872.0 }, "timestamp": "2025-09-05 08:55:30.534979", "step": 800, "epoch": 1 }, { "type": "loss", "content": 0.36771687865257263, "timestamp": "2025-09-05 08:55:30.537058", "step": 801, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:55:30.733797", "step": 801, "epoch": 1 }, { "type": "loss", "content": 0.265923410654068, "timestamp": "2025-09-05 08:55:30.735543", "step": 802, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:55:30.940549", "step": 802, "epoch": 1 }, { "type": "loss", "content": 0.3522961437702179, "timestamp": "2025-09-05 08:55:30.942341", "step": 803, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:55:31.108326", "step": 803, "epoch": 1 }, { "type": "loss", "content": 0.3711682856082916, "timestamp": "2025-09-05 08:55:31.124440", "step": 804, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:55:31.321294", "step": 804, "epoch": 1 }, { "type": "loss", "content": 0.282815158367157, "timestamp": "2025-09-05 08:55:31.325387", "step": 805, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:55:31.524159", "step": 805, "epoch": 1 }, { "type": "loss", "content": 0.3416968882083893, "timestamp": "2025-09-05 08:55:31.527357", "step": 806, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:55:31.725722", "step": 806, "epoch": 1 }, { "type": "loss", "content": 0.35129979252815247, "timestamp": "2025-09-05 08:55:31.729004", "step": 807, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:55:31.926674", "step": 807, "epoch": 1 }, { "type": "loss", "content": 0.4110635221004486, "timestamp": "2025-09-05 08:55:31.943568", "step": 808, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 08:55:32.141794", "step": 808, "epoch": 1 }, { "type": "loss", "content": 0.5824915766716003, "timestamp": "2025-09-05 08:55:32.143517", "step": 809, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:55:32.348023", "step": 809, "epoch": 1 }, { "type": "loss", "content": 0.4817580282688141, "timestamp": "2025-09-05 08:55:32.350161", "step": 810, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:55:32.555273", "step": 810, "epoch": 1 }, { "type": "loss", "content": 0.3244217038154602, "timestamp": "2025-09-05 08:55:32.557016", "step": 811, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:55:32.754146", "step": 811, "epoch": 1 }, { "type": "loss", "content": 0.377000093460083, "timestamp": "2025-09-05 08:55:32.763538", "step": 812, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:55:32.928523", "step": 812, "epoch": 1 }, { "type": "loss", "content": 0.40550920367240906, "timestamp": "2025-09-05 08:55:32.930496", "step": 813, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:55:33.135757", "step": 813, "epoch": 1 }, { "type": "loss", "content": 0.3100672662258148, "timestamp": "2025-09-05 08:55:33.138151", "step": 814, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:55:33.346041", "step": 814, "epoch": 1 }, { "type": "loss", "content": 0.37069234251976013, "timestamp": "2025-09-05 08:55:33.348524", "step": 815, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:55:33.553773", "step": 815, "epoch": 1 }, { "type": "loss", "content": 0.3425869047641754, "timestamp": "2025-09-05 08:55:33.568801", "step": 816, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:55:33.756241", "step": 816, "epoch": 1 }, { "type": "loss", "content": 0.4293539524078369, "timestamp": "2025-09-05 08:55:33.758599", "step": 817, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:55:33.962790", "step": 817, "epoch": 1 }, { "type": "loss", "content": 0.31967583298683167, "timestamp": "2025-09-05 08:55:33.964832", "step": 818, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:55:34.160870", "step": 818, "epoch": 1 }, { "type": "loss", "content": 0.38507017493247986, "timestamp": "2025-09-05 08:55:34.163142", "step": 819, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:55:34.333933", "step": 819, "epoch": 1 }, { "type": "loss", "content": 0.42399299144744873, "timestamp": "2025-09-05 08:55:34.349042", "step": 820, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 08:55:39.001578", "step": 820, "epoch": 1 }, { "type": "pplx", "content": 58.07706559280723, "timestamp": "2025-09-05 08:55:39.003502", "step": 820, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:55:39.163673", "step": 820, "epoch": 1 }, { "type": "loss", "content": 0.3387710452079773, "timestamp": "2025-09-05 08:55:39.165694", "step": 821, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:55:39.333573", "step": 821, "epoch": 1 }, { "type": "loss", "content": 0.2951369285583496, "timestamp": "2025-09-05 08:55:39.335550", "step": 822, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:55:39.543460", "step": 822, "epoch": 1 }, { "type": "loss", "content": 0.35738304257392883, "timestamp": "2025-09-05 08:55:39.546054", "step": 823, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:55:39.742891", "step": 823, "epoch": 1 }, { "type": "loss", "content": 0.3632713258266449, "timestamp": "2025-09-05 08:55:39.756880", "step": 824, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 08:55:39.953686", "step": 824, "epoch": 1 }, { "type": "loss", "content": 0.37211427092552185, "timestamp": "2025-09-05 08:55:39.955910", "step": 825, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:55:40.162235", "step": 825, "epoch": 1 }, { "type": "loss", "content": 0.3463480472564697, "timestamp": "2025-09-05 08:55:40.164065", "step": 826, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 08:55:40.371292", "step": 826, "epoch": 1 }, { "type": "loss", "content": 0.30579307675361633, "timestamp": "2025-09-05 08:55:40.373150", "step": 827, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:55:40.580090", "step": 827, "epoch": 1 }, { "type": "loss", "content": 0.40940576791763306, "timestamp": "2025-09-05 08:55:40.594902", "step": 828, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:55:40.792047", "step": 828, "epoch": 1 }, { "type": "loss", "content": 0.36023303866386414, "timestamp": "2025-09-05 08:55:40.793964", "step": 829, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:55:40.990352", "step": 829, "epoch": 1 }, { "type": "loss", "content": 0.3491280972957611, "timestamp": "2025-09-05 08:55:40.992165", "step": 830, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:55:41.188266", "step": 830, "epoch": 1 }, { "type": "loss", "content": 0.24948126077651978, "timestamp": "2025-09-05 08:55:41.189890", "step": 831, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:55:41.387216", "step": 831, "epoch": 1 }, { "type": "loss", "content": 0.31294217705726624, "timestamp": "2025-09-05 08:55:41.401260", "step": 832, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:55:41.590851", "step": 832, "epoch": 1 }, { "type": "loss", "content": 0.4010050594806671, "timestamp": "2025-09-05 08:55:41.592639", "step": 833, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:55:41.788041", "step": 833, "epoch": 1 }, { "type": "loss", "content": 0.3963695466518402, "timestamp": "2025-09-05 08:55:41.790319", "step": 834, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:55:41.986059", "step": 834, "epoch": 1 }, { "type": "loss", "content": 0.4662969708442688, "timestamp": "2025-09-05 08:55:41.988012", "step": 835, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:55:42.185669", "step": 835, "epoch": 1 }, { "type": "loss", "content": 0.44282209873199463, "timestamp": "2025-09-05 08:55:42.199987", "step": 836, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:55:42.388364", "step": 836, "epoch": 1 }, { "type": "loss", "content": 0.2699596881866455, "timestamp": "2025-09-05 08:55:42.390059", "step": 837, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:55:42.555606", "step": 837, "epoch": 1 }, { "type": "loss", "content": 0.42487242817878723, "timestamp": "2025-09-05 08:55:42.557823", "step": 838, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:55:42.763983", "step": 838, "epoch": 1 }, { "type": "loss", "content": 0.25971612334251404, "timestamp": "2025-09-05 08:55:42.766011", "step": 839, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 4480027263872.0 }, "timestamp": "2025-09-05 08:55:42.964541", "step": 839, "epoch": 1 }, { "type": "loss", "content": 0.3642326593399048, "timestamp": "2025-09-05 08:55:42.978888", "step": 840, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 08:55:47.651856", "step": 840, "epoch": 1 }, { "type": "pplx", "content": 58.15736728059577, "timestamp": "2025-09-05 08:55:47.653828", "step": 840, "epoch": 1 }, { "type": "info", "content": "Checkpoint saved at step 840", "timestamp": "2025-09-05 08:55:48.144064", "step": 840, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:55:48.312663", "step": 840, "epoch": 1 }, { "type": "loss", "content": 0.517951488494873, "timestamp": "2025-09-05 08:55:48.314430", "step": 841, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:55:48.508434", "step": 841, "epoch": 1 }, { "type": "loss", "content": 0.297713965177536, "timestamp": "2025-09-05 08:55:48.510702", "step": 842, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:55:48.707704", "step": 842, "epoch": 1 }, { "type": "loss", "content": 0.4420361816883087, "timestamp": "2025-09-05 08:55:48.709846", "step": 843, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:55:48.876016", "step": 843, "epoch": 1 }, { "type": "loss", "content": 0.4004204273223877, "timestamp": "2025-09-05 08:55:48.892549", "step": 844, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:55:49.090938", "step": 844, "epoch": 1 }, { "type": "loss", "content": 0.3982437551021576, "timestamp": "2025-09-05 08:55:49.092914", "step": 845, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 08:55:49.261019", "step": 845, "epoch": 1 }, { "type": "loss", "content": 0.36752861738204956, "timestamp": "2025-09-05 08:55:49.265198", "step": 846, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:55:49.460827", "step": 846, "epoch": 1 }, { "type": "loss", "content": 0.2838650941848755, "timestamp": "2025-09-05 08:55:49.463902", "step": 847, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:55:49.673060", "step": 847, "epoch": 1 }, { "type": "loss", "content": 0.3069932758808136, "timestamp": "2025-09-05 08:55:49.687326", "step": 848, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:55:49.877164", "step": 848, "epoch": 1 }, { "type": "loss", "content": 0.43903687596321106, "timestamp": "2025-09-05 08:55:49.879086", "step": 849, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:55:50.075926", "step": 849, "epoch": 1 }, { "type": "loss", "content": 0.2802756428718567, "timestamp": "2025-09-05 08:55:50.078314", "step": 850, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:55:50.285280", "step": 850, "epoch": 1 }, { "type": "loss", "content": 0.24855327606201172, "timestamp": "2025-09-05 08:55:50.287275", "step": 851, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:55:50.489334", "step": 851, "epoch": 1 }, { "type": "loss", "content": 0.4161975681781769, "timestamp": "2025-09-05 08:55:50.503796", "step": 852, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:55:50.695106", "step": 852, "epoch": 1 }, { "type": "loss", "content": 0.45736148953437805, "timestamp": "2025-09-05 08:55:50.698191", "step": 853, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:55:50.903392", "step": 853, "epoch": 1 }, { "type": "loss", "content": 0.36564257740974426, "timestamp": "2025-09-05 08:55:50.905223", "step": 854, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:55:51.102891", "step": 854, "epoch": 1 }, { "type": "loss", "content": 0.37967172265052795, "timestamp": "2025-09-05 08:55:51.105298", "step": 855, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:55:51.304529", "step": 855, "epoch": 1 }, { "type": "loss", "content": 0.4026603400707245, "timestamp": "2025-09-05 08:55:51.321055", "step": 856, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 08:55:51.519946", "step": 856, "epoch": 1 }, { "type": "loss", "content": 0.22071857750415802, "timestamp": "2025-09-05 08:55:51.522074", "step": 857, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:55:51.720784", "step": 857, "epoch": 1 }, { "type": "loss", "content": 0.47518184781074524, "timestamp": "2025-09-05 08:55:51.722581", "step": 858, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:55:51.928566", "step": 858, "epoch": 1 }, { "type": "loss", "content": 0.3092558681964874, "timestamp": "2025-09-05 08:55:51.930334", "step": 859, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:55:52.127465", "step": 859, "epoch": 1 }, { "type": "loss", "content": 0.2976468801498413, "timestamp": "2025-09-05 08:55:52.142114", "step": 860, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 08:55:56.850637", "step": 860, "epoch": 1 }, { "type": "pplx", "content": 58.433954195954186, "timestamp": "2025-09-05 08:55:56.852647", "step": 860, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:55:57.015085", "step": 860, "epoch": 1 }, { "type": "loss", "content": 0.3771287798881531, "timestamp": "2025-09-05 08:55:57.016999", "step": 861, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:55:57.221522", "step": 861, "epoch": 1 }, { "type": "loss", "content": 0.4275374710559845, "timestamp": "2025-09-05 08:55:57.223447", "step": 862, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:55:57.432116", "step": 862, "epoch": 1 }, { "type": "loss", "content": 0.4731064438819885, "timestamp": "2025-09-05 08:55:57.434290", "step": 863, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:55:57.632130", "step": 863, "epoch": 1 }, { "type": "loss", "content": 0.2372477501630783, "timestamp": "2025-09-05 08:55:57.646792", "step": 864, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:55:57.846296", "step": 864, "epoch": 1 }, { "type": "loss", "content": 0.328989714384079, "timestamp": "2025-09-05 08:55:57.848392", "step": 865, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 08:55:58.048431", "step": 865, "epoch": 1 }, { "type": "loss", "content": 0.43380752205848694, "timestamp": "2025-09-05 08:55:58.050531", "step": 866, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:55:58.248313", "step": 866, "epoch": 1 }, { "type": "loss", "content": 0.31248700618743896, "timestamp": "2025-09-05 08:55:58.250155", "step": 867, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:55:58.449096", "step": 867, "epoch": 1 }, { "type": "loss", "content": 0.3782794773578644, "timestamp": "2025-09-05 08:55:58.463518", "step": 868, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:55:58.655068", "step": 868, "epoch": 1 }, { "type": "loss", "content": 0.46563515067100525, "timestamp": "2025-09-05 08:55:58.657655", "step": 869, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:55:58.853817", "step": 869, "epoch": 1 }, { "type": "loss", "content": 0.30388137698173523, "timestamp": "2025-09-05 08:55:58.855603", "step": 870, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:55:59.065956", "step": 870, "epoch": 1 }, { "type": "loss", "content": 0.27287933230400085, "timestamp": "2025-09-05 08:55:59.067801", "step": 871, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:55:59.267381", "step": 871, "epoch": 1 }, { "type": "loss", "content": 0.23755168914794922, "timestamp": "2025-09-05 08:55:59.284414", "step": 872, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:55:59.482355", "step": 872, "epoch": 1 }, { "type": "loss", "content": 0.37823018431663513, "timestamp": "2025-09-05 08:55:59.484362", "step": 873, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:55:59.690986", "step": 873, "epoch": 1 }, { "type": "loss", "content": 0.4183447062969208, "timestamp": "2025-09-05 08:55:59.694146", "step": 874, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-05 08:55:59.902530", "step": 874, "epoch": 1 }, { "type": "loss", "content": 0.36634987592697144, "timestamp": "2025-09-05 08:55:59.904278", "step": 875, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:56:00.112019", "step": 875, "epoch": 1 }, { "type": "loss", "content": 0.5448222756385803, "timestamp": "2025-09-05 08:56:00.126417", "step": 876, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:56:00.323866", "step": 876, "epoch": 1 }, { "type": "loss", "content": 0.38981127738952637, "timestamp": "2025-09-05 08:56:00.325658", "step": 877, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 08:56:00.533301", "step": 877, "epoch": 1 }, { "type": "loss", "content": 0.3060409724712372, "timestamp": "2025-09-05 08:56:00.535138", "step": 878, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:56:00.732164", "step": 878, "epoch": 1 }, { "type": "loss", "content": 0.23060426115989685, "timestamp": "2025-09-05 08:56:00.733955", "step": 879, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:56:00.930960", "step": 879, "epoch": 1 }, { "type": "loss", "content": 0.3777828514575958, "timestamp": "2025-09-05 08:56:00.945178", "step": 880, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 08:56:05.651096", "step": 880, "epoch": 1 }, { "type": "pplx", "content": 58.9930927717275, "timestamp": "2025-09-05 08:56:05.653602", "step": 880, "epoch": 1 }, { "type": "info", "content": "Checkpoint saved at step 880", "timestamp": "2025-09-05 08:56:06.119224", "step": 880, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 4480027263872.0 }, "timestamp": "2025-09-05 08:56:06.284103", "step": 880, "epoch": 1 }, { "type": "loss", "content": 0.35168516635894775, "timestamp": "2025-09-05 08:56:06.286070", "step": 881, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 4480027263872.0 }, "timestamp": "2025-09-05 08:56:06.485122", "step": 881, "epoch": 1 }, { "type": "loss", "content": 0.4242359697818756, "timestamp": "2025-09-05 08:56:06.487493", "step": 882, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:56:06.658498", "step": 882, "epoch": 1 }, { "type": "loss", "content": 0.3520510792732239, "timestamp": "2025-09-05 08:56:06.660398", "step": 883, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:56:06.866185", "step": 883, "epoch": 1 }, { "type": "loss", "content": 0.2529168725013733, "timestamp": "2025-09-05 08:56:06.880043", "step": 884, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 4480027263872.0 }, "timestamp": "2025-09-05 08:56:07.068068", "step": 884, "epoch": 1 }, { "type": "loss", "content": 0.4118684232234955, "timestamp": "2025-09-05 08:56:07.071747", "step": 885, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:56:07.283773", "step": 885, "epoch": 1 }, { "type": "loss", "content": 0.2805653512477875, "timestamp": "2025-09-05 08:56:07.290307", "step": 886, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:56:07.498225", "step": 886, "epoch": 1 }, { "type": "loss", "content": 0.3658747971057892, "timestamp": "2025-09-05 08:56:07.503757", "step": 887, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 08:56:07.723427", "step": 887, "epoch": 1 }, { "type": "loss", "content": 0.5264832973480225, "timestamp": "2025-09-05 08:56:07.738249", "step": 888, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 08:56:07.931407", "step": 888, "epoch": 1 }, { "type": "loss", "content": 0.4625481069087982, "timestamp": "2025-09-05 08:56:07.934631", "step": 889, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:56:08.137874", "step": 889, "epoch": 1 }, { "type": "loss", "content": 0.2717623710632324, "timestamp": "2025-09-05 08:56:08.141167", "step": 890, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:56:08.355120", "step": 890, "epoch": 1 }, { "type": "loss", "content": 0.26788651943206787, "timestamp": "2025-09-05 08:56:08.357651", "step": 891, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:56:08.556363", "step": 891, "epoch": 1 }, { "type": "loss", "content": 0.43440476059913635, "timestamp": "2025-09-05 08:56:08.572805", "step": 892, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:56:08.763294", "step": 892, "epoch": 1 }, { "type": "loss", "content": 0.46988850831985474, "timestamp": "2025-09-05 08:56:08.765546", "step": 893, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 08:56:08.975242", "step": 893, "epoch": 1 }, { "type": "loss", "content": 0.35338684916496277, "timestamp": "2025-09-05 08:56:08.977587", "step": 894, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:56:09.196975", "step": 894, "epoch": 1 }, { "type": "loss", "content": 0.3955008089542389, "timestamp": "2025-09-05 08:56:09.205880", "step": 895, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:56:09.416322", "step": 895, "epoch": 1 }, { "type": "loss", "content": 0.4844711422920227, "timestamp": "2025-09-05 08:56:09.435475", "step": 896, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:56:09.636116", "step": 896, "epoch": 1 }, { "type": "loss", "content": 0.38304269313812256, "timestamp": "2025-09-05 08:56:09.640059", "step": 897, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:56:09.844074", "step": 897, "epoch": 1 }, { "type": "loss", "content": 0.4427229166030884, "timestamp": "2025-09-05 08:56:09.846695", "step": 898, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:56:10.052111", "step": 898, "epoch": 1 }, { "type": "loss", "content": 0.31587520241737366, "timestamp": "2025-09-05 08:56:10.054379", "step": 899, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 08:56:10.262257", "step": 899, "epoch": 1 }, { "type": "loss", "content": 0.2594102919101715, "timestamp": "2025-09-05 08:56:10.276857", "step": 900, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 08:56:14.955511", "step": 900, "epoch": 1 }, { "type": "pplx", "content": 58.8565162716663, "timestamp": "2025-09-05 08:56:14.957342", "step": 900, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 4480027263872.0 }, "timestamp": "2025-09-05 08:56:15.119484", "step": 900, "epoch": 1 }, { "type": "loss", "content": 0.3601221740245819, "timestamp": "2025-09-05 08:56:15.121057", "step": 901, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 08:56:15.326247", "step": 901, "epoch": 1 }, { "type": "loss", "content": 0.296526163816452, "timestamp": "2025-09-05 08:56:15.327954", "step": 902, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:56:15.534820", "step": 902, "epoch": 1 }, { "type": "loss", "content": 0.2748814821243286, "timestamp": "2025-09-05 08:56:15.537456", "step": 903, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:56:15.735119", "step": 903, "epoch": 1 }, { "type": "loss", "content": 0.31274929642677307, "timestamp": "2025-09-05 08:56:15.752496", "step": 904, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:56:15.949953", "step": 904, "epoch": 1 }, { "type": "loss", "content": 0.45903486013412476, "timestamp": "2025-09-05 08:56:15.952321", "step": 905, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:56:16.160410", "step": 905, "epoch": 1 }, { "type": "loss", "content": 0.32286933064460754, "timestamp": "2025-09-05 08:56:16.162171", "step": 906, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:56:16.359899", "step": 906, "epoch": 1 }, { "type": "loss", "content": 0.3217444121837616, "timestamp": "2025-09-05 08:56:16.361873", "step": 907, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:56:16.556943", "step": 907, "epoch": 1 }, { "type": "loss", "content": 0.2954324185848236, "timestamp": "2025-09-05 08:56:16.571357", "step": 908, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:56:16.759132", "step": 908, "epoch": 1 }, { "type": "loss", "content": 0.3679000735282898, "timestamp": "2025-09-05 08:56:16.761079", "step": 909, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:56:16.959729", "step": 909, "epoch": 1 }, { "type": "loss", "content": 0.41062143445014954, "timestamp": "2025-09-05 08:56:16.961531", "step": 910, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:56:17.157829", "step": 910, "epoch": 1 }, { "type": "loss", "content": 0.36205968260765076, "timestamp": "2025-09-05 08:56:17.160372", "step": 911, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 4480027263872.0 }, "timestamp": "2025-09-05 08:56:17.356759", "step": 911, "epoch": 1 }, { "type": "loss", "content": 0.3409394323825836, "timestamp": "2025-09-05 08:56:17.365973", "step": 912, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:56:17.529633", "step": 912, "epoch": 1 }, { "type": "loss", "content": 0.31686097383499146, "timestamp": "2025-09-05 08:56:17.531168", "step": 913, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:56:17.697289", "step": 913, "epoch": 1 }, { "type": "loss", "content": 0.22255216538906097, "timestamp": "2025-09-05 08:56:17.699236", "step": 914, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:56:17.905578", "step": 914, "epoch": 1 }, { "type": "loss", "content": 0.30591610074043274, "timestamp": "2025-09-05 08:56:17.907210", "step": 915, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:56:18.103761", "step": 915, "epoch": 1 }, { "type": "loss", "content": 0.36227649450302124, "timestamp": "2025-09-05 08:56:18.118540", "step": 916, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:56:18.307208", "step": 916, "epoch": 1 }, { "type": "loss", "content": 0.29489269852638245, "timestamp": "2025-09-05 08:56:18.308973", "step": 917, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 08:56:18.506067", "step": 917, "epoch": 1 }, { "type": "loss", "content": 0.3366081118583679, "timestamp": "2025-09-05 08:56:18.507597", "step": 918, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:56:18.716065", "step": 918, "epoch": 1 }, { "type": "loss", "content": 0.2996135652065277, "timestamp": "2025-09-05 08:56:18.717925", "step": 919, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:56:18.916645", "step": 919, "epoch": 1 }, { "type": "loss", "content": 0.31638243794441223, "timestamp": "2025-09-05 08:56:18.930921", "step": 920, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 08:56:23.565560", "step": 920, "epoch": 1 }, { "type": "pplx", "content": 58.30528894827069, "timestamp": "2025-09-05 08:56:23.567482", "step": 920, "epoch": 1 }, { "type": "info", "content": "Checkpoint saved at step 920", "timestamp": "2025-09-05 08:56:24.094332", "step": 920, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 08:56:24.266169", "step": 920, "epoch": 1 }, { "type": "loss", "content": 0.31981584429740906, "timestamp": "2025-09-05 08:56:24.268944", "step": 921, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:56:24.440635", "step": 921, "epoch": 1 }, { "type": "loss", "content": 0.4614734649658203, "timestamp": "2025-09-05 08:56:24.442107", "step": 922, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:56:24.647745", "step": 922, "epoch": 1 }, { "type": "loss", "content": 0.3498396575450897, "timestamp": "2025-09-05 08:56:24.649674", "step": 923, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:56:24.855227", "step": 923, "epoch": 1 }, { "type": "loss", "content": 0.2465495467185974, "timestamp": "2025-09-05 08:56:24.864527", "step": 924, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:56:25.029444", "step": 924, "epoch": 1 }, { "type": "loss", "content": 0.5145090818405151, "timestamp": "2025-09-05 08:56:25.031415", "step": 925, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:56:25.238088", "step": 925, "epoch": 1 }, { "type": "loss", "content": 0.3693583607673645, "timestamp": "2025-09-05 08:56:25.239819", "step": 926, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:56:25.437346", "step": 926, "epoch": 1 }, { "type": "loss", "content": 0.33343541622161865, "timestamp": "2025-09-05 08:56:25.439379", "step": 927, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:56:25.644585", "step": 927, "epoch": 1 }, { "type": "loss", "content": 0.28294891119003296, "timestamp": "2025-09-05 08:56:25.659020", "step": 928, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:56:25.847035", "step": 928, "epoch": 1 }, { "type": "loss", "content": 0.3684309720993042, "timestamp": "2025-09-05 08:56:25.848699", "step": 929, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:56:26.044378", "step": 929, "epoch": 1 }, { "type": "loss", "content": 0.3786807060241699, "timestamp": "2025-09-05 08:56:26.046519", "step": 930, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:56:26.252989", "step": 930, "epoch": 1 }, { "type": "loss", "content": 0.2511294186115265, "timestamp": "2025-09-05 08:56:26.254868", "step": 931, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:56:26.451301", "step": 931, "epoch": 1 }, { "type": "loss", "content": 0.26673150062561035, "timestamp": "2025-09-05 08:56:26.469187", "step": 932, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:56:26.664664", "step": 932, "epoch": 1 }, { "type": "loss", "content": 0.2586221396923065, "timestamp": "2025-09-05 08:56:26.667023", "step": 933, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:56:26.861586", "step": 933, "epoch": 1 }, { "type": "loss", "content": 0.23219725489616394, "timestamp": "2025-09-05 08:56:26.864119", "step": 934, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 08:56:27.060475", "step": 934, "epoch": 1 }, { "type": "loss", "content": 0.3612004816532135, "timestamp": "2025-09-05 08:56:27.062849", "step": 935, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:56:27.258572", "step": 935, "epoch": 1 }, { "type": "loss", "content": 0.19465096294879913, "timestamp": "2025-09-05 08:56:27.274029", "step": 936, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:56:27.461704", "step": 936, "epoch": 1 }, { "type": "loss", "content": 0.2936038374900818, "timestamp": "2025-09-05 08:56:27.464576", "step": 937, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:56:27.661307", "step": 937, "epoch": 1 }, { "type": "loss", "content": 0.34567755460739136, "timestamp": "2025-09-05 08:56:27.663756", "step": 938, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:56:27.869389", "step": 938, "epoch": 1 }, { "type": "loss", "content": 0.45169833302497864, "timestamp": "2025-09-05 08:56:27.873145", "step": 939, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:56:28.068724", "step": 939, "epoch": 1 }, { "type": "loss", "content": 0.41843459010124207, "timestamp": "2025-09-05 08:56:28.083793", "step": 940, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 08:56:32.727257", "step": 940, "epoch": 1 }, { "type": "pplx", "content": 57.96766332265921, "timestamp": "2025-09-05 08:56:32.729095", "step": 940, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:56:32.889362", "step": 940, "epoch": 1 }, { "type": "loss", "content": 0.33605149388313293, "timestamp": "2025-09-05 08:56:32.891081", "step": 941, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:56:33.057909", "step": 941, "epoch": 1 }, { "type": "loss", "content": 0.28017252683639526, "timestamp": "2025-09-05 08:56:33.059718", "step": 942, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:56:33.266409", "step": 942, "epoch": 1 }, { "type": "loss", "content": 0.32622429728507996, "timestamp": "2025-09-05 08:56:33.268164", "step": 943, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:56:33.435862", "step": 943, "epoch": 1 }, { "type": "loss", "content": 0.3794481158256531, "timestamp": "2025-09-05 08:56:33.452555", "step": 944, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:56:33.650667", "step": 944, "epoch": 1 }, { "type": "loss", "content": 0.32684630155563354, "timestamp": "2025-09-05 08:56:33.652374", "step": 945, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:56:33.848749", "step": 945, "epoch": 1 }, { "type": "loss", "content": 0.34910067915916443, "timestamp": "2025-09-05 08:56:33.850516", "step": 946, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-05 08:56:34.052773", "step": 946, "epoch": 1 }, { "type": "loss", "content": 0.39786896109580994, "timestamp": "2025-09-05 08:56:34.054439", "step": 947, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:56:34.261687", "step": 947, "epoch": 1 }, { "type": "loss", "content": 0.410081684589386, "timestamp": "2025-09-05 08:56:34.276444", "step": 948, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 4480027263872.0 }, "timestamp": "2025-09-05 08:56:34.466684", "step": 948, "epoch": 1 }, { "type": "loss", "content": 0.5023239850997925, "timestamp": "2025-09-05 08:56:34.468684", "step": 949, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:56:34.664812", "step": 949, "epoch": 1 }, { "type": "loss", "content": 0.29403749108314514, "timestamp": "2025-09-05 08:56:34.667965", "step": 950, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:56:34.863532", "step": 950, "epoch": 1 }, { "type": "loss", "content": 0.2468639761209488, "timestamp": "2025-09-05 08:56:34.865338", "step": 951, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:56:35.063835", "step": 951, "epoch": 1 }, { "type": "loss", "content": 0.41007065773010254, "timestamp": "2025-09-05 08:56:35.077372", "step": 952, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:56:35.272719", "step": 952, "epoch": 1 }, { "type": "loss", "content": 0.26287195086479187, "timestamp": "2025-09-05 08:56:35.275516", "step": 953, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:56:35.442519", "step": 953, "epoch": 1 }, { "type": "loss", "content": 0.4006151854991913, "timestamp": "2025-09-05 08:56:35.444495", "step": 954, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:56:35.650340", "step": 954, "epoch": 1 }, { "type": "loss", "content": 0.25933578610420227, "timestamp": "2025-09-05 08:56:35.653205", "step": 955, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:56:35.856923", "step": 955, "epoch": 1 }, { "type": "loss", "content": 0.2387603372335434, "timestamp": "2025-09-05 08:56:35.873711", "step": 956, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:56:36.072514", "step": 956, "epoch": 1 }, { "type": "loss", "content": 0.3455435633659363, "timestamp": "2025-09-05 08:56:36.074229", "step": 957, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:56:36.270676", "step": 957, "epoch": 1 }, { "type": "loss", "content": 0.42304477095603943, "timestamp": "2025-09-05 08:56:36.272861", "step": 958, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:56:36.470360", "step": 958, "epoch": 1 }, { "type": "loss", "content": 0.5853138566017151, "timestamp": "2025-09-05 08:56:36.472351", "step": 959, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 4480027263872.0 }, "timestamp": "2025-09-05 08:56:36.678570", "step": 959, "epoch": 1 }, { "type": "loss", "content": 0.4282821714878082, "timestamp": "2025-09-05 08:56:36.696107", "step": 960, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 08:56:41.365028", "step": 960, "epoch": 1 }, { "type": "pplx", "content": 58.0195968066891, "timestamp": "2025-09-05 08:56:41.367535", "step": 960, "epoch": 1 }, { "type": "info", "content": "Checkpoint saved at step 960", "timestamp": "2025-09-05 08:56:41.827347", "step": 960, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:56:41.991345", "step": 960, "epoch": 1 }, { "type": "loss", "content": 0.38176435232162476, "timestamp": "2025-09-05 08:56:41.992895", "step": 961, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:56:42.188364", "step": 961, "epoch": 1 }, { "type": "loss", "content": 0.3238293528556824, "timestamp": "2025-09-05 08:56:42.190547", "step": 962, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:56:42.387838", "step": 962, "epoch": 1 }, { "type": "loss", "content": 0.37185245752334595, "timestamp": "2025-09-05 08:56:42.389442", "step": 963, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:56:42.587007", "step": 963, "epoch": 1 }, { "type": "loss", "content": 0.33915528655052185, "timestamp": "2025-09-05 08:56:42.601403", "step": 964, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:56:42.789914", "step": 964, "epoch": 1 }, { "type": "loss", "content": 0.4303402900695801, "timestamp": "2025-09-05 08:56:42.792185", "step": 965, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:56:42.989673", "step": 965, "epoch": 1 }, { "type": "loss", "content": 0.41477546095848083, "timestamp": "2025-09-05 08:56:42.991198", "step": 966, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:56:43.201206", "step": 966, "epoch": 1 }, { "type": "loss", "content": 0.3982813060283661, "timestamp": "2025-09-05 08:56:43.202897", "step": 967, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 08:56:43.399469", "step": 967, "epoch": 1 }, { "type": "loss", "content": 0.2835683822631836, "timestamp": "2025-09-05 08:56:43.413180", "step": 968, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 4480027263872.0 }, "timestamp": "2025-09-05 08:56:43.601190", "step": 968, "epoch": 1 }, { "type": "loss", "content": 0.39541733264923096, "timestamp": "2025-09-05 08:56:43.606800", "step": 969, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:56:43.812853", "step": 969, "epoch": 1 }, { "type": "loss", "content": 0.37673527002334595, "timestamp": "2025-09-05 08:56:43.814961", "step": 970, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:56:44.012986", "step": 970, "epoch": 1 }, { "type": "loss", "content": 0.43757012486457825, "timestamp": "2025-09-05 08:56:44.014992", "step": 971, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:56:44.219053", "step": 971, "epoch": 1 }, { "type": "loss", "content": 0.34792765974998474, "timestamp": "2025-09-05 08:56:44.236654", "step": 972, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:56:44.435682", "step": 972, "epoch": 1 }, { "type": "loss", "content": 0.2519514858722687, "timestamp": "2025-09-05 08:56:44.437511", "step": 973, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:56:44.644146", "step": 973, "epoch": 1 }, { "type": "loss", "content": 0.37428492307662964, "timestamp": "2025-09-05 08:56:44.645999", "step": 974, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:56:44.842027", "step": 974, "epoch": 1 }, { "type": "loss", "content": 0.4176678955554962, "timestamp": "2025-09-05 08:56:44.843935", "step": 975, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 08:56:45.041995", "step": 975, "epoch": 1 }, { "type": "loss", "content": 0.32866379618644714, "timestamp": "2025-09-05 08:56:45.056419", "step": 976, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 08:56:45.253585", "step": 976, "epoch": 1 }, { "type": "loss", "content": 0.3537275493144989, "timestamp": "2025-09-05 08:56:45.255728", "step": 977, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:56:45.462343", "step": 977, "epoch": 1 }, { "type": "loss", "content": 0.39139440655708313, "timestamp": "2025-09-05 08:56:45.464065", "step": 978, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:56:45.660402", "step": 978, "epoch": 1 }, { "type": "loss", "content": 0.47776591777801514, "timestamp": "2025-09-05 08:56:45.662496", "step": 979, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 08:56:45.870532", "step": 979, "epoch": 1 }, { "type": "loss", "content": 0.39145949482917786, "timestamp": "2025-09-05 08:56:45.884767", "step": 980, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 08:56:50.525816", "step": 980, "epoch": 1 }, { "type": "pplx", "content": 57.20121236465665, "timestamp": "2025-09-05 08:56:50.527616", "step": 980, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 08:56:50.691990", "step": 980, "epoch": 1 }, { "type": "loss", "content": 0.34527283906936646, "timestamp": "2025-09-05 08:56:50.694050", "step": 981, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:56:50.861530", "step": 981, "epoch": 1 }, { "type": "loss", "content": 0.43041056394577026, "timestamp": "2025-09-05 08:56:50.863361", "step": 982, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:56:51.068897", "step": 982, "epoch": 1 }, { "type": "loss", "content": 0.32550325989723206, "timestamp": "2025-09-05 08:56:51.070801", "step": 983, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:56:51.276949", "step": 983, "epoch": 1 }, { "type": "loss", "content": 0.3620685040950775, "timestamp": "2025-09-05 08:56:51.291433", "step": 984, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:56:51.479999", "step": 984, "epoch": 1 }, { "type": "loss", "content": 0.36788034439086914, "timestamp": "2025-09-05 08:56:51.482550", "step": 985, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:56:51.678037", "step": 985, "epoch": 1 }, { "type": "loss", "content": 0.20247577130794525, "timestamp": "2025-09-05 08:56:51.680666", "step": 986, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:56:51.887299", "step": 986, "epoch": 1 }, { "type": "loss", "content": 0.255616694688797, "timestamp": "2025-09-05 08:56:51.889216", "step": 987, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:56:52.085171", "step": 987, "epoch": 1 }, { "type": "loss", "content": 0.32949069142341614, "timestamp": "2025-09-05 08:56:52.099490", "step": 988, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:56:52.298089", "step": 988, "epoch": 1 }, { "type": "loss", "content": 0.2952141761779785, "timestamp": "2025-09-05 08:56:52.300055", "step": 989, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:56:52.495645", "step": 989, "epoch": 1 }, { "type": "loss", "content": 0.3354770541191101, "timestamp": "2025-09-05 08:56:52.497604", "step": 990, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 08:56:52.696906", "step": 990, "epoch": 1 }, { "type": "loss", "content": 0.4000490605831146, "timestamp": "2025-09-05 08:56:52.699540", "step": 991, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:56:52.895826", "step": 991, "epoch": 1 }, { "type": "loss", "content": 0.400611937046051, "timestamp": "2025-09-05 08:56:52.905288", "step": 992, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:56:53.068126", "step": 992, "epoch": 1 }, { "type": "loss", "content": 0.41154736280441284, "timestamp": "2025-09-05 08:56:53.069916", "step": 993, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:56:53.289245", "step": 993, "epoch": 1 }, { "type": "loss", "content": 0.3314960300922394, "timestamp": "2025-09-05 08:56:53.291054", "step": 994, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:56:53.499477", "step": 994, "epoch": 1 }, { "type": "loss", "content": 0.3384896218776703, "timestamp": "2025-09-05 08:56:53.501707", "step": 995, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:56:53.708716", "step": 995, "epoch": 1 }, { "type": "loss", "content": 0.24565917253494263, "timestamp": "2025-09-05 08:56:53.723148", "step": 996, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 08:56:53.914157", "step": 996, "epoch": 1 }, { "type": "loss", "content": 0.30330580472946167, "timestamp": "2025-09-05 08:56:53.915892", "step": 997, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 4480027263872.0 }, "timestamp": "2025-09-05 08:56:54.113693", "step": 997, "epoch": 1 }, { "type": "loss", "content": 0.339424192905426, "timestamp": "2025-09-05 08:56:54.116077", "step": 998, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:56:54.321917", "step": 998, "epoch": 1 }, { "type": "loss", "content": 0.3887946605682373, "timestamp": "2025-09-05 08:56:54.323779", "step": 999, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:56:54.523115", "step": 999, "epoch": 1 }, { "type": "loss", "content": 0.3614341616630554, "timestamp": "2025-09-05 08:56:54.537460", "step": 1000, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 08:56:59.200485", "step": 1000, "epoch": 1 }, { "type": "pplx", "content": 56.91533872503707, "timestamp": "2025-09-05 08:56:59.202606", "step": 1000, "epoch": 1 }, { "type": "info", "content": "Checkpoint saved at step 1000", "timestamp": "2025-09-05 08:56:59.653697", "step": 1000, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 08:56:59.817222", "step": 1000, "epoch": 1 }, { "type": "loss", "content": 0.39639365673065186, "timestamp": "2025-09-05 08:56:59.819075", "step": 1001, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:56:59.988417", "step": 1001, "epoch": 1 }, { "type": "loss", "content": 0.34922510385513306, "timestamp": "2025-09-05 08:56:59.990486", "step": 1002, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:57:00.196565", "step": 1002, "epoch": 1 }, { "type": "loss", "content": 0.3727569878101349, "timestamp": "2025-09-05 08:57:00.198531", "step": 1003, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:57:00.366728", "step": 1003, "epoch": 1 }, { "type": "loss", "content": 0.4180833697319031, "timestamp": "2025-09-05 08:57:00.381916", "step": 1004, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:57:00.579962", "step": 1004, "epoch": 1 }, { "type": "loss", "content": 0.3678615689277649, "timestamp": "2025-09-05 08:57:00.582147", "step": 1005, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:57:00.782046", "step": 1005, "epoch": 1 }, { "type": "loss", "content": 0.4152246415615082, "timestamp": "2025-09-05 08:57:00.783965", "step": 1006, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:57:00.983251", "step": 1006, "epoch": 1 }, { "type": "loss", "content": 0.25905001163482666, "timestamp": "2025-09-05 08:57:00.985125", "step": 1007, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:57:01.153313", "step": 1007, "epoch": 1 }, { "type": "loss", "content": 0.32605719566345215, "timestamp": "2025-09-05 08:57:01.168446", "step": 1008, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:57:01.357148", "step": 1008, "epoch": 1 }, { "type": "loss", "content": 0.23667758703231812, "timestamp": "2025-09-05 08:57:01.359310", "step": 1009, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:57:01.556187", "step": 1009, "epoch": 1 }, { "type": "loss", "content": 0.32055971026420593, "timestamp": "2025-09-05 08:57:01.558142", "step": 1010, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:57:01.754795", "step": 1010, "epoch": 1 }, { "type": "loss", "content": 0.3258388340473175, "timestamp": "2025-09-05 08:57:01.757186", "step": 1011, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:57:01.957019", "step": 1011, "epoch": 1 }, { "type": "loss", "content": 0.3455141484737396, "timestamp": "2025-09-05 08:57:01.971080", "step": 1012, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:57:02.159089", "step": 1012, "epoch": 1 }, { "type": "loss", "content": 0.33423060178756714, "timestamp": "2025-09-05 08:57:02.161397", "step": 1013, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:57:02.360107", "step": 1013, "epoch": 1 }, { "type": "loss", "content": 0.2707538902759552, "timestamp": "2025-09-05 08:57:02.364072", "step": 1014, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 08:57:02.568044", "step": 1014, "epoch": 1 }, { "type": "loss", "content": 0.3885761499404907, "timestamp": "2025-09-05 08:57:02.570516", "step": 1015, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:57:02.768993", "step": 1015, "epoch": 1 }, { "type": "loss", "content": 0.26470303535461426, "timestamp": "2025-09-05 08:57:02.783878", "step": 1016, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:57:02.975347", "step": 1016, "epoch": 1 }, { "type": "loss", "content": 0.3257010877132416, "timestamp": "2025-09-05 08:57:02.977993", "step": 1017, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 08:57:03.175106", "step": 1017, "epoch": 1 }, { "type": "loss", "content": 0.34169089794158936, "timestamp": "2025-09-05 08:57:03.176698", "step": 1018, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:57:03.375475", "step": 1018, "epoch": 1 }, { "type": "loss", "content": 0.3442467749118805, "timestamp": "2025-09-05 08:57:03.377243", "step": 1019, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:57:03.575558", "step": 1019, "epoch": 1 }, { "type": "loss", "content": 0.3116152286529541, "timestamp": "2025-09-05 08:57:03.592015", "step": 1020, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 08:57:08.258421", "step": 1020, "epoch": 1 }, { "type": "pplx", "content": 56.72869426129927, "timestamp": "2025-09-05 08:57:08.260263", "step": 1020, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:57:08.421747", "step": 1020, "epoch": 1 }, { "type": "loss", "content": 0.40635350346565247, "timestamp": "2025-09-05 08:57:08.424564", "step": 1021, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:57:08.591844", "step": 1021, "epoch": 1 }, { "type": "loss", "content": 0.5129305720329285, "timestamp": "2025-09-05 08:57:08.593715", "step": 1022, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:57:08.799002", "step": 1022, "epoch": 1 }, { "type": "loss", "content": 0.3052510917186737, "timestamp": "2025-09-05 08:57:08.800799", "step": 1023, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:57:08.997545", "step": 1023, "epoch": 1 }, { "type": "loss", "content": 0.29209795594215393, "timestamp": "2025-09-05 08:57:09.011812", "step": 1024, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:57:09.200301", "step": 1024, "epoch": 1 }, { "type": "loss", "content": 0.4792327284812927, "timestamp": "2025-09-05 08:57:09.202863", "step": 1025, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:57:09.370832", "step": 1025, "epoch": 1 }, { "type": "loss", "content": 0.2132500410079956, "timestamp": "2025-09-05 08:57:09.373056", "step": 1026, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:57:09.569420", "step": 1026, "epoch": 1 }, { "type": "loss", "content": 0.4008413553237915, "timestamp": "2025-09-05 08:57:09.571230", "step": 1027, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:57:09.775965", "step": 1027, "epoch": 1 }, { "type": "loss", "content": 0.40069666504859924, "timestamp": "2025-09-05 08:57:09.785979", "step": 1028, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:57:09.951269", "step": 1028, "epoch": 1 }, { "type": "loss", "content": 0.4267215132713318, "timestamp": "2025-09-05 08:57:09.953022", "step": 1029, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:57:10.158013", "step": 1029, "epoch": 1 }, { "type": "loss", "content": 0.38750991225242615, "timestamp": "2025-09-05 08:57:10.159677", "step": 1030, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:57:10.352728", "step": 1030, "epoch": 1 }, { "type": "loss", "content": 0.26181745529174805, "timestamp": "2025-09-05 08:57:10.354524", "step": 1031, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:57:10.560112", "step": 1031, "epoch": 1 }, { "type": "loss", "content": 0.27084633708000183, "timestamp": "2025-09-05 08:57:10.576594", "step": 1032, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:57:10.772045", "step": 1032, "epoch": 1 }, { "type": "loss", "content": 0.29004645347595215, "timestamp": "2025-09-05 08:57:10.774068", "step": 1033, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:57:10.940422", "step": 1033, "epoch": 1 }, { "type": "loss", "content": 0.4643933176994324, "timestamp": "2025-09-05 08:57:10.942597", "step": 1034, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 4480027263872.0 }, "timestamp": "2025-09-05 08:57:11.150075", "step": 1034, "epoch": 1 }, { "type": "loss", "content": 0.42606353759765625, "timestamp": "2025-09-05 08:57:11.151790", "step": 1035, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:57:11.358130", "step": 1035, "epoch": 1 }, { "type": "loss", "content": 0.5070635676383972, "timestamp": "2025-09-05 08:57:11.375711", "step": 1036, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:57:11.572391", "step": 1036, "epoch": 1 }, { "type": "loss", "content": 0.3080455958843231, "timestamp": "2025-09-05 08:57:11.574129", "step": 1037, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:57:11.778556", "step": 1037, "epoch": 1 }, { "type": "loss", "content": 0.25634804368019104, "timestamp": "2025-09-05 08:57:11.780617", "step": 1038, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 4480027263872.0 }, "timestamp": "2025-09-05 08:57:11.987608", "step": 1038, "epoch": 1 }, { "type": "loss", "content": 0.3523538410663605, "timestamp": "2025-09-05 08:57:11.989435", "step": 1039, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:57:12.186193", "step": 1039, "epoch": 1 }, { "type": "loss", "content": 0.353440523147583, "timestamp": "2025-09-05 08:57:12.195768", "step": 1040, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 08:57:16.862221", "step": 1040, "epoch": 1 }, { "type": "pplx", "content": 56.865375266262824, "timestamp": "2025-09-05 08:57:16.865926", "step": 1040, "epoch": 1 }, { "type": "info", "content": "Checkpoint saved at step 1040", "timestamp": "2025-09-05 08:57:17.336759", "step": 1040, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:57:17.507192", "step": 1040, "epoch": 1 }, { "type": "loss", "content": 0.4089427888393402, "timestamp": "2025-09-05 08:57:17.510045", "step": 1041, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:57:17.715791", "step": 1041, "epoch": 1 }, { "type": "loss", "content": 0.35010460019111633, "timestamp": "2025-09-05 08:57:17.718881", "step": 1042, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:57:17.914889", "step": 1042, "epoch": 1 }, { "type": "loss", "content": 0.2957332134246826, "timestamp": "2025-09-05 08:57:17.917349", "step": 1043, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:57:18.114054", "step": 1043, "epoch": 1 }, { "type": "loss", "content": 0.25546780228614807, "timestamp": "2025-09-05 08:57:18.128508", "step": 1044, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:57:18.329998", "step": 1044, "epoch": 1 }, { "type": "loss", "content": 0.29871493577957153, "timestamp": "2025-09-05 08:57:18.332151", "step": 1045, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:57:18.501032", "step": 1045, "epoch": 1 }, { "type": "loss", "content": 0.3960125744342804, "timestamp": "2025-09-05 08:57:18.503962", "step": 1046, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:57:18.711095", "step": 1046, "epoch": 1 }, { "type": "loss", "content": 0.2644537091255188, "timestamp": "2025-09-05 08:57:18.713113", "step": 1047, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:57:18.910983", "step": 1047, "epoch": 1 }, { "type": "loss", "content": 0.30242469906806946, "timestamp": "2025-09-05 08:57:18.920490", "step": 1048, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:57:19.084703", "step": 1048, "epoch": 1 }, { "type": "loss", "content": 0.43861666321754456, "timestamp": "2025-09-05 08:57:19.087028", "step": 1049, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:57:19.256016", "step": 1049, "epoch": 1 }, { "type": "loss", "content": 0.26393255591392517, "timestamp": "2025-09-05 08:57:19.258205", "step": 1050, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:57:19.464084", "step": 1050, "epoch": 1 }, { "type": "loss", "content": 0.29509156942367554, "timestamp": "2025-09-05 08:57:19.466567", "step": 1051, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 08:57:19.637608", "step": 1051, "epoch": 1 }, { "type": "loss", "content": 0.26441511511802673, "timestamp": "2025-09-05 08:57:19.654587", "step": 1052, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:57:19.852491", "step": 1052, "epoch": 1 }, { "type": "loss", "content": 0.3607224225997925, "timestamp": "2025-09-05 08:57:19.854411", "step": 1053, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 08:57:20.026434", "step": 1053, "epoch": 1 }, { "type": "loss", "content": 0.41068440675735474, "timestamp": "2025-09-05 08:57:20.028325", "step": 1054, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:57:20.238848", "step": 1054, "epoch": 1 }, { "type": "loss", "content": 0.3460855484008789, "timestamp": "2025-09-05 08:57:20.241070", "step": 1055, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:57:20.438914", "step": 1055, "epoch": 1 }, { "type": "loss", "content": 0.22883664071559906, "timestamp": "2025-09-05 08:57:20.455318", "step": 1056, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:57:20.652501", "step": 1056, "epoch": 1 }, { "type": "loss", "content": 0.3002649247646332, "timestamp": "2025-09-05 08:57:20.654137", "step": 1057, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 08:57:20.859648", "step": 1057, "epoch": 1 }, { "type": "loss", "content": 0.3611856698989868, "timestamp": "2025-09-05 08:57:20.862321", "step": 1058, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:57:21.030886", "step": 1058, "epoch": 1 }, { "type": "loss", "content": 0.34930041432380676, "timestamp": "2025-09-05 08:57:21.033038", "step": 1059, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:57:21.241166", "step": 1059, "epoch": 1 }, { "type": "loss", "content": 0.2977052628993988, "timestamp": "2025-09-05 08:57:21.255758", "step": 1060, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 08:57:25.941683", "step": 1060, "epoch": 1 }, { "type": "pplx", "content": 58.08633734713871, "timestamp": "2025-09-05 08:57:25.943919", "step": 1060, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 4480027263872.0 }, "timestamp": "2025-09-05 08:57:26.106655", "step": 1060, "epoch": 1 }, { "type": "loss", "content": 0.3529953360557556, "timestamp": "2025-09-05 08:57:26.109260", "step": 1061, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:57:26.276963", "step": 1061, "epoch": 1 }, { "type": "loss", "content": 0.25426262617111206, "timestamp": "2025-09-05 08:57:26.278757", "step": 1062, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:57:26.484665", "step": 1062, "epoch": 1 }, { "type": "loss", "content": 0.39096131920814514, "timestamp": "2025-09-05 08:57:26.486460", "step": 1063, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:57:26.682998", "step": 1063, "epoch": 1 }, { "type": "loss", "content": 0.39098745584487915, "timestamp": "2025-09-05 08:57:26.697339", "step": 1064, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:57:26.885072", "step": 1064, "epoch": 1 }, { "type": "loss", "content": 0.39723387360572815, "timestamp": "2025-09-05 08:57:26.887006", "step": 1065, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:57:27.083172", "step": 1065, "epoch": 1 }, { "type": "loss", "content": 0.3086792528629303, "timestamp": "2025-09-05 08:57:27.085530", "step": 1066, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 08:57:27.283705", "step": 1066, "epoch": 1 }, { "type": "loss", "content": 0.3772234320640564, "timestamp": "2025-09-05 08:57:27.285628", "step": 1067, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 08:57:27.483230", "step": 1067, "epoch": 1 }, { "type": "loss", "content": 0.44487428665161133, "timestamp": "2025-09-05 08:57:27.497543", "step": 1068, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 4480027263872.0 }, "timestamp": "2025-09-05 08:57:27.686727", "step": 1068, "epoch": 1 }, { "type": "loss", "content": 0.2712862193584442, "timestamp": "2025-09-05 08:57:27.688749", "step": 1069, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:57:27.899909", "step": 1069, "epoch": 1 }, { "type": "loss", "content": 0.2736116647720337, "timestamp": "2025-09-05 08:57:27.902023", "step": 1070, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:57:28.104397", "step": 1070, "epoch": 1 }, { "type": "loss", "content": 0.2640036344528198, "timestamp": "2025-09-05 08:57:28.106341", "step": 1071, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-05 08:57:28.301834", "step": 1071, "epoch": 1 }, { "type": "loss", "content": 0.25708210468292236, "timestamp": "2025-09-05 08:57:28.316390", "step": 1072, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:57:28.502823", "step": 1072, "epoch": 1 }, { "type": "loss", "content": 0.4192257523536682, "timestamp": "2025-09-05 08:57:28.504604", "step": 1073, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:57:28.702247", "step": 1073, "epoch": 1 }, { "type": "loss", "content": 0.5188899040222168, "timestamp": "2025-09-05 08:57:28.704344", "step": 1074, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:57:28.910313", "step": 1074, "epoch": 1 }, { "type": "loss", "content": 0.47707605361938477, "timestamp": "2025-09-05 08:57:28.912539", "step": 1075, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 08:57:29.123376", "step": 1075, "epoch": 1 }, { "type": "loss", "content": 0.2549794614315033, "timestamp": "2025-09-05 08:57:29.140734", "step": 1076, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:57:29.340155", "step": 1076, "epoch": 1 }, { "type": "loss", "content": 0.32116273045539856, "timestamp": "2025-09-05 08:57:29.342027", "step": 1077, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:57:29.537300", "step": 1077, "epoch": 1 }, { "type": "loss", "content": 0.2507237195968628, "timestamp": "2025-09-05 08:57:29.539351", "step": 1078, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:57:29.744422", "step": 1078, "epoch": 1 }, { "type": "loss", "content": 0.30762243270874023, "timestamp": "2025-09-05 08:57:29.746343", "step": 1079, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:57:29.943339", "step": 1079, "epoch": 1 }, { "type": "loss", "content": 0.3563655614852905, "timestamp": "2025-09-05 08:57:29.960859", "step": 1080, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 08:57:34.630014", "step": 1080, "epoch": 1 }, { "type": "pplx", "content": 57.13828478413463, "timestamp": "2025-09-05 08:57:34.632225", "step": 1080, "epoch": 1 }, { "type": "info", "content": "Checkpoint saved at step 1080", "timestamp": "2025-09-05 08:57:35.114220", "step": 1080, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:57:35.297356", "step": 1080, "epoch": 1 }, { "type": "loss", "content": 0.3543899655342102, "timestamp": "2025-09-05 08:57:35.299618", "step": 1081, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:57:35.505794", "step": 1081, "epoch": 1 }, { "type": "loss", "content": 0.37011322379112244, "timestamp": "2025-09-05 08:57:35.507707", "step": 1082, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:57:35.705175", "step": 1082, "epoch": 1 }, { "type": "loss", "content": 0.29471492767333984, "timestamp": "2025-09-05 08:57:35.706975", "step": 1083, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:57:35.903105", "step": 1083, "epoch": 1 }, { "type": "loss", "content": 0.40031829476356506, "timestamp": "2025-09-05 08:57:35.917669", "step": 1084, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:57:36.107208", "step": 1084, "epoch": 1 }, { "type": "loss", "content": 0.25579631328582764, "timestamp": "2025-09-05 08:57:36.109233", "step": 1085, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:57:36.314564", "step": 1085, "epoch": 1 }, { "type": "loss", "content": 0.30125442147254944, "timestamp": "2025-09-05 08:57:36.316525", "step": 1086, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:57:36.522862", "step": 1086, "epoch": 1 }, { "type": "loss", "content": 0.3710455000400543, "timestamp": "2025-09-05 08:57:36.525296", "step": 1087, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:57:36.722872", "step": 1087, "epoch": 1 }, { "type": "loss", "content": 0.38871708512306213, "timestamp": "2025-09-05 08:57:36.736903", "step": 1088, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:57:36.925159", "step": 1088, "epoch": 1 }, { "type": "loss", "content": 0.4635475277900696, "timestamp": "2025-09-05 08:57:36.926826", "step": 1089, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-05 08:57:37.121132", "step": 1089, "epoch": 1 }, { "type": "loss", "content": 0.3786855936050415, "timestamp": "2025-09-05 08:57:37.123066", "step": 1090, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 08:57:37.320185", "step": 1090, "epoch": 1 }, { "type": "loss", "content": 0.29588383436203003, "timestamp": "2025-09-05 08:57:37.321973", "step": 1091, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:57:37.516900", "step": 1091, "epoch": 1 }, { "type": "loss", "content": 0.2814580500125885, "timestamp": "2025-09-05 08:57:37.531031", "step": 1092, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:57:37.719213", "step": 1092, "epoch": 1 }, { "type": "loss", "content": 0.29636815190315247, "timestamp": "2025-09-05 08:57:37.721048", "step": 1093, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 4480027263872.0 }, "timestamp": "2025-09-05 08:57:37.888769", "step": 1093, "epoch": 1 }, { "type": "loss", "content": 0.30050742626190186, "timestamp": "2025-09-05 08:57:37.890910", "step": 1094, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:57:38.095014", "step": 1094, "epoch": 1 }, { "type": "loss", "content": 0.21762610971927643, "timestamp": "2025-09-05 08:57:38.096704", "step": 1095, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:57:38.293599", "step": 1095, "epoch": 1 }, { "type": "loss", "content": 0.36448773741722107, "timestamp": "2025-09-05 08:57:38.308108", "step": 1096, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:57:38.495244", "step": 1096, "epoch": 1 }, { "type": "loss", "content": 0.2675822079181671, "timestamp": "2025-09-05 08:57:38.497086", "step": 1097, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 08:57:38.703191", "step": 1097, "epoch": 1 }, { "type": "loss", "content": 0.3438616693019867, "timestamp": "2025-09-05 08:57:38.705356", "step": 1098, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 08:57:38.901089", "step": 1098, "epoch": 1 }, { "type": "loss", "content": 0.25544899702072144, "timestamp": "2025-09-05 08:57:38.903018", "step": 1099, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:57:39.109292", "step": 1099, "epoch": 1 }, { "type": "loss", "content": 0.15850801765918732, "timestamp": "2025-09-05 08:57:39.123875", "step": 1100, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 08:57:43.756122", "step": 1100, "epoch": 1 }, { "type": "pplx", "content": 55.68282147255737, "timestamp": "2025-09-05 08:57:43.758533", "step": 1100, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:57:43.921152", "step": 1100, "epoch": 1 }, { "type": "loss", "content": 0.4078519344329834, "timestamp": "2025-09-05 08:57:43.923717", "step": 1101, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:57:44.130660", "step": 1101, "epoch": 1 }, { "type": "loss", "content": 0.31081441044807434, "timestamp": "2025-09-05 08:57:44.132606", "step": 1102, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:57:44.340156", "step": 1102, "epoch": 1 }, { "type": "loss", "content": 0.14278367161750793, "timestamp": "2025-09-05 08:57:44.342528", "step": 1103, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:57:44.549416", "step": 1103, "epoch": 1 }, { "type": "loss", "content": 0.2088335156440735, "timestamp": "2025-09-05 08:57:44.563823", "step": 1104, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:57:44.760932", "step": 1104, "epoch": 1 }, { "type": "loss", "content": 0.3414754569530487, "timestamp": "2025-09-05 08:57:44.762833", "step": 1105, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:57:44.960267", "step": 1105, "epoch": 1 }, { "type": "loss", "content": 0.43541449308395386, "timestamp": "2025-09-05 08:57:44.962499", "step": 1106, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:57:45.170538", "step": 1106, "epoch": 1 }, { "type": "loss", "content": 0.38098201155662537, "timestamp": "2025-09-05 08:57:45.173030", "step": 1107, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 272 ], "flops": 5440033091648.0 }, "timestamp": "2025-09-05 08:57:45.374443", "step": 1107, "epoch": 1 }, { "type": "loss", "content": 0.49568867683410645, "timestamp": "2025-09-05 08:57:45.391316", "step": 1108, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:57:45.590356", "step": 1108, "epoch": 1 }, { "type": "loss", "content": 0.20845304429531097, "timestamp": "2025-09-05 08:57:45.592498", "step": 1109, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:57:45.789652", "step": 1109, "epoch": 1 }, { "type": "loss", "content": 0.34582969546318054, "timestamp": "2025-09-05 08:57:45.791657", "step": 1110, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:57:45.995088", "step": 1110, "epoch": 1 }, { "type": "loss", "content": 0.30466410517692566, "timestamp": "2025-09-05 08:57:45.997298", "step": 1111, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:57:46.192699", "step": 1111, "epoch": 1 }, { "type": "loss", "content": 0.3194935917854309, "timestamp": "2025-09-05 08:57:46.209949", "step": 1112, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:57:46.408493", "step": 1112, "epoch": 1 }, { "type": "loss", "content": 0.3247276246547699, "timestamp": "2025-09-05 08:57:46.410457", "step": 1113, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:57:46.617098", "step": 1113, "epoch": 1 }, { "type": "loss", "content": 0.26578959822654724, "timestamp": "2025-09-05 08:57:46.618843", "step": 1114, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:57:46.818714", "step": 1114, "epoch": 1 }, { "type": "loss", "content": 0.48078733682632446, "timestamp": "2025-09-05 08:57:46.820884", "step": 1115, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:57:47.016092", "step": 1115, "epoch": 1 }, { "type": "loss", "content": 0.3435327410697937, "timestamp": "2025-09-05 08:57:47.030516", "step": 1116, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-05 08:57:47.216523", "step": 1116, "epoch": 1 }, { "type": "loss", "content": 0.3160141408443451, "timestamp": "2025-09-05 08:57:47.218401", "step": 1117, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:57:47.412594", "step": 1117, "epoch": 1 }, { "type": "loss", "content": 0.3424045145511627, "timestamp": "2025-09-05 08:57:47.414440", "step": 1118, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:57:47.610879", "step": 1118, "epoch": 1 }, { "type": "loss", "content": 0.29838377237319946, "timestamp": "2025-09-05 08:57:47.612751", "step": 1119, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:57:47.808963", "step": 1119, "epoch": 1 }, { "type": "loss", "content": 0.2777602970600128, "timestamp": "2025-09-05 08:57:47.823432", "step": 1120, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 08:57:52.452094", "step": 1120, "epoch": 1 }, { "type": "pplx", "content": 56.402618878722485, "timestamp": "2025-09-05 08:57:52.454888", "step": 1120, "epoch": 1 }, { "type": "info", "content": "Checkpoint saved at step 1120", "timestamp": "2025-09-05 08:57:52.924333", "step": 1120, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-05 08:57:53.084960", "step": 1120, "epoch": 1 }, { "type": "loss", "content": 0.2436535656452179, "timestamp": "2025-09-05 08:57:53.086827", "step": 1121, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:57:53.289641", "step": 1121, "epoch": 1 }, { "type": "loss", "content": 0.3066195845603943, "timestamp": "2025-09-05 08:57:53.291300", "step": 1122, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 08:57:53.499294", "step": 1122, "epoch": 1 }, { "type": "loss", "content": 0.210659921169281, "timestamp": "2025-09-05 08:57:53.501932", "step": 1123, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 08:57:53.701662", "step": 1123, "epoch": 1 }, { "type": "loss", "content": 0.42288801074028015, "timestamp": "2025-09-05 08:57:53.718355", "step": 1124, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:57:53.917047", "step": 1124, "epoch": 1 }, { "type": "loss", "content": 0.27715227007865906, "timestamp": "2025-09-05 08:57:53.918967", "step": 1125, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:57:54.085170", "step": 1125, "epoch": 1 }, { "type": "loss", "content": 0.4282056987285614, "timestamp": "2025-09-05 08:57:54.087062", "step": 1126, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-05 08:57:54.280878", "step": 1126, "epoch": 1 }, { "type": "loss", "content": 0.3095110356807709, "timestamp": "2025-09-05 08:57:54.283528", "step": 1127, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:57:54.489385", "step": 1127, "epoch": 1 }, { "type": "loss", "content": 0.3560994267463684, "timestamp": "2025-09-05 08:57:54.504279", "step": 1128, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:57:54.706093", "step": 1128, "epoch": 1 }, { "type": "loss", "content": 0.5486235618591309, "timestamp": "2025-09-05 08:57:54.708108", "step": 1129, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:57:54.915209", "step": 1129, "epoch": 1 }, { "type": "loss", "content": 0.2991634011268616, "timestamp": "2025-09-05 08:57:54.917880", "step": 1130, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 08:57:55.116971", "step": 1130, "epoch": 1 }, { "type": "loss", "content": 0.323395311832428, "timestamp": "2025-09-05 08:57:55.119750", "step": 1131, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:57:55.315888", "step": 1131, "epoch": 1 }, { "type": "loss", "content": 0.2886749505996704, "timestamp": "2025-09-05 08:57:55.330392", "step": 1132, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:57:55.529682", "step": 1132, "epoch": 1 }, { "type": "loss", "content": 0.3267287313938141, "timestamp": "2025-09-05 08:57:55.531485", "step": 1133, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:57:55.728151", "step": 1133, "epoch": 1 }, { "type": "loss", "content": 0.38941022753715515, "timestamp": "2025-09-05 08:57:55.730008", "step": 1134, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 08:57:55.924848", "step": 1134, "epoch": 1 }, { "type": "loss", "content": 0.3658711016178131, "timestamp": "2025-09-05 08:57:55.926503", "step": 1135, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:57:56.093798", "step": 1135, "epoch": 1 }, { "type": "loss", "content": 0.34820908308029175, "timestamp": "2025-09-05 08:57:56.110544", "step": 1136, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:57:56.308140", "step": 1136, "epoch": 1 }, { "type": "loss", "content": 0.3882652819156647, "timestamp": "2025-09-05 08:57:56.310266", "step": 1137, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:57:56.479788", "step": 1137, "epoch": 1 }, { "type": "loss", "content": 0.17920145392417908, "timestamp": "2025-09-05 08:57:56.482460", "step": 1138, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:57:56.689791", "step": 1138, "epoch": 1 }, { "type": "loss", "content": 0.21023604273796082, "timestamp": "2025-09-05 08:57:56.692566", "step": 1139, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:57:56.889514", "step": 1139, "epoch": 1 }, { "type": "loss", "content": 0.28918132185935974, "timestamp": "2025-09-05 08:57:56.904085", "step": 1140, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 08:58:01.541928", "step": 1140, "epoch": 1 }, { "type": "pplx", "content": 56.62639246330814, "timestamp": "2025-09-05 08:58:01.543982", "step": 1140, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:58:01.704731", "step": 1140, "epoch": 1 }, { "type": "loss", "content": 0.2688441276550293, "timestamp": "2025-09-05 08:58:01.707270", "step": 1141, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:58:01.918528", "step": 1141, "epoch": 1 }, { "type": "loss", "content": 0.14376114308834076, "timestamp": "2025-09-05 08:58:01.923187", "step": 1142, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:58:02.134322", "step": 1142, "epoch": 1 }, { "type": "loss", "content": 0.375431627035141, "timestamp": "2025-09-05 08:58:02.136938", "step": 1143, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 08:58:02.351335", "step": 1143, "epoch": 1 }, { "type": "loss", "content": 0.3484734892845154, "timestamp": "2025-09-05 08:58:02.371162", "step": 1144, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:58:02.572337", "step": 1144, "epoch": 1 }, { "type": "loss", "content": 0.3586747646331787, "timestamp": "2025-09-05 08:58:02.573999", "step": 1145, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:58:02.768214", "step": 1145, "epoch": 1 }, { "type": "loss", "content": 0.339052677154541, "timestamp": "2025-09-05 08:58:02.770566", "step": 1146, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:58:02.966732", "step": 1146, "epoch": 1 }, { "type": "loss", "content": 0.38937392830848694, "timestamp": "2025-09-05 08:58:02.969114", "step": 1147, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 4480027263872.0 }, "timestamp": "2025-09-05 08:58:03.175468", "step": 1147, "epoch": 1 }, { "type": "loss", "content": 0.5443062782287598, "timestamp": "2025-09-05 08:58:03.191793", "step": 1148, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:58:03.390206", "step": 1148, "epoch": 1 }, { "type": "loss", "content": 0.2671515643596649, "timestamp": "2025-09-05 08:58:03.392071", "step": 1149, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:58:03.588804", "step": 1149, "epoch": 1 }, { "type": "loss", "content": 0.31323206424713135, "timestamp": "2025-09-05 08:58:03.591276", "step": 1150, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:58:03.788661", "step": 1150, "epoch": 1 }, { "type": "loss", "content": 0.4581383764743805, "timestamp": "2025-09-05 08:58:03.790937", "step": 1151, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:58:03.996509", "step": 1151, "epoch": 1 }, { "type": "loss", "content": 0.3642024099826813, "timestamp": "2025-09-05 08:58:04.011304", "step": 1152, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:58:04.200325", "step": 1152, "epoch": 1 }, { "type": "loss", "content": 0.28851771354675293, "timestamp": "2025-09-05 08:58:04.202081", "step": 1153, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:58:04.398012", "step": 1153, "epoch": 1 }, { "type": "loss", "content": 0.297490656375885, "timestamp": "2025-09-05 08:58:04.399844", "step": 1154, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:58:04.595632", "step": 1154, "epoch": 1 }, { "type": "loss", "content": 0.3883640766143799, "timestamp": "2025-09-05 08:58:04.597484", "step": 1155, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:58:04.793403", "step": 1155, "epoch": 1 }, { "type": "loss", "content": 0.4353066086769104, "timestamp": "2025-09-05 08:58:04.809872", "step": 1156, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 08:58:05.007788", "step": 1156, "epoch": 1 }, { "type": "loss", "content": 0.41394755244255066, "timestamp": "2025-09-05 08:58:05.010408", "step": 1157, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:58:05.210965", "step": 1157, "epoch": 1 }, { "type": "loss", "content": 0.4061930477619171, "timestamp": "2025-09-05 08:58:05.213045", "step": 1158, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:58:05.379194", "step": 1158, "epoch": 1 }, { "type": "loss", "content": 0.39679089188575745, "timestamp": "2025-09-05 08:58:05.382513", "step": 1159, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 4480027263872.0 }, "timestamp": "2025-09-05 08:58:05.589595", "step": 1159, "epoch": 1 }, { "type": "loss", "content": 0.26510128378868103, "timestamp": "2025-09-05 08:58:05.604655", "step": 1160, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 08:58:10.260868", "step": 1160, "epoch": 1 }, { "type": "pplx", "content": 55.69607325889674, "timestamp": "2025-09-05 08:58:10.262863", "step": 1160, "epoch": 1 }, { "type": "info", "content": "Checkpoint saved at step 1160", "timestamp": "2025-09-05 08:58:10.713992", "step": 1160, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:58:10.883875", "step": 1160, "epoch": 1 }, { "type": "loss", "content": 0.2949378490447998, "timestamp": "2025-09-05 08:58:10.885977", "step": 1161, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 08:58:11.082355", "step": 1161, "epoch": 1 }, { "type": "loss", "content": 0.481171578168869, "timestamp": "2025-09-05 08:58:11.083958", "step": 1162, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:58:11.280145", "step": 1162, "epoch": 1 }, { "type": "loss", "content": 0.3129526972770691, "timestamp": "2025-09-05 08:58:11.281934", "step": 1163, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:58:11.476684", "step": 1163, "epoch": 1 }, { "type": "loss", "content": 0.3266424238681793, "timestamp": "2025-09-05 08:58:11.491572", "step": 1164, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:58:11.679597", "step": 1164, "epoch": 1 }, { "type": "loss", "content": 0.23583292961120605, "timestamp": "2025-09-05 08:58:11.681817", "step": 1165, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 08:58:11.882166", "step": 1165, "epoch": 1 }, { "type": "loss", "content": 0.5377989411354065, "timestamp": "2025-09-05 08:58:11.885749", "step": 1166, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:58:12.056875", "step": 1166, "epoch": 1 }, { "type": "loss", "content": 0.3818263113498688, "timestamp": "2025-09-05 08:58:12.059180", "step": 1167, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:58:12.265640", "step": 1167, "epoch": 1 }, { "type": "loss", "content": 0.41938114166259766, "timestamp": "2025-09-05 08:58:12.277483", "step": 1168, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:58:12.447993", "step": 1168, "epoch": 1 }, { "type": "loss", "content": 0.17846006155014038, "timestamp": "2025-09-05 08:58:12.449643", "step": 1169, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:58:12.656563", "step": 1169, "epoch": 1 }, { "type": "loss", "content": 0.3130786418914795, "timestamp": "2025-09-05 08:58:12.658864", "step": 1170, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:58:12.856448", "step": 1170, "epoch": 1 }, { "type": "loss", "content": 0.24914269149303436, "timestamp": "2025-09-05 08:58:12.858087", "step": 1171, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:58:13.053886", "step": 1171, "epoch": 1 }, { "type": "loss", "content": 0.3340952694416046, "timestamp": "2025-09-05 08:58:13.068458", "step": 1172, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:58:13.258558", "step": 1172, "epoch": 1 }, { "type": "loss", "content": 0.3117923438549042, "timestamp": "2025-09-05 08:58:13.260298", "step": 1173, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:58:13.428124", "step": 1173, "epoch": 1 }, { "type": "loss", "content": 0.36320504546165466, "timestamp": "2025-09-05 08:58:13.430056", "step": 1174, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:58:13.597395", "step": 1174, "epoch": 1 }, { "type": "loss", "content": 0.3952508866786957, "timestamp": "2025-09-05 08:58:13.599309", "step": 1175, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:58:13.809227", "step": 1175, "epoch": 1 }, { "type": "loss", "content": 0.3737662136554718, "timestamp": "2025-09-05 08:58:13.823689", "step": 1176, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:58:14.012685", "step": 1176, "epoch": 1 }, { "type": "loss", "content": 0.4855545461177826, "timestamp": "2025-09-05 08:58:14.014560", "step": 1177, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:58:14.210707", "step": 1177, "epoch": 1 }, { "type": "loss", "content": 0.489812970161438, "timestamp": "2025-09-05 08:58:14.212582", "step": 1178, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:58:14.418706", "step": 1178, "epoch": 1 }, { "type": "loss", "content": 0.3980942666530609, "timestamp": "2025-09-05 08:58:14.423470", "step": 1179, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:58:14.629308", "step": 1179, "epoch": 1 }, { "type": "loss", "content": 0.4134223163127899, "timestamp": "2025-09-05 08:58:14.645650", "step": 1180, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 08:58:19.351980", "step": 1180, "epoch": 1 }, { "type": "pplx", "content": 54.78913997248039, "timestamp": "2025-09-05 08:58:19.355359", "step": 1180, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 08:58:19.524235", "step": 1180, "epoch": 1 }, { "type": "loss", "content": 0.3536244332790375, "timestamp": "2025-09-05 08:58:19.526582", "step": 1181, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:58:19.725722", "step": 1181, "epoch": 1 }, { "type": "loss", "content": 0.2990207076072693, "timestamp": "2025-09-05 08:58:19.729850", "step": 1182, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:58:19.933714", "step": 1182, "epoch": 1 }, { "type": "loss", "content": 0.2687399983406067, "timestamp": "2025-09-05 08:58:19.935632", "step": 1183, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-05 08:58:20.139334", "step": 1183, "epoch": 1 }, { "type": "loss", "content": 0.3062213957309723, "timestamp": "2025-09-05 08:58:20.156178", "step": 1184, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:58:20.353342", "step": 1184, "epoch": 1 }, { "type": "loss", "content": 0.3144480586051941, "timestamp": "2025-09-05 08:58:20.355192", "step": 1185, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 08:58:20.552429", "step": 1185, "epoch": 1 }, { "type": "loss", "content": 0.34665945172309875, "timestamp": "2025-09-05 08:58:20.554396", "step": 1186, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:58:20.762922", "step": 1186, "epoch": 1 }, { "type": "loss", "content": 0.2913441061973572, "timestamp": "2025-09-05 08:58:20.764934", "step": 1187, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:58:20.961203", "step": 1187, "epoch": 1 }, { "type": "loss", "content": 0.27978482842445374, "timestamp": "2025-09-05 08:58:20.977949", "step": 1188, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 08:58:21.176178", "step": 1188, "epoch": 1 }, { "type": "loss", "content": 0.3447984755039215, "timestamp": "2025-09-05 08:58:21.178182", "step": 1189, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:58:21.385149", "step": 1189, "epoch": 1 }, { "type": "loss", "content": 0.27464714646339417, "timestamp": "2025-09-05 08:58:21.387222", "step": 1190, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:58:21.596698", "step": 1190, "epoch": 1 }, { "type": "loss", "content": 0.31913846731185913, "timestamp": "2025-09-05 08:58:21.600384", "step": 1191, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:58:21.804357", "step": 1191, "epoch": 1 }, { "type": "loss", "content": 0.3542693257331848, "timestamp": "2025-09-05 08:58:21.818909", "step": 1192, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:58:22.006890", "step": 1192, "epoch": 1 }, { "type": "loss", "content": 0.36727648973464966, "timestamp": "2025-09-05 08:58:22.009547", "step": 1193, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:58:22.203981", "step": 1193, "epoch": 1 }, { "type": "loss", "content": 0.5776286125183105, "timestamp": "2025-09-05 08:58:22.206814", "step": 1194, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-05 08:58:22.400605", "step": 1194, "epoch": 1 }, { "type": "loss", "content": 0.2967214286327362, "timestamp": "2025-09-05 08:58:22.403373", "step": 1195, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:58:22.596778", "step": 1195, "epoch": 1 }, { "type": "loss", "content": 0.3207745850086212, "timestamp": "2025-09-05 08:58:22.612062", "step": 1196, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:58:22.807056", "step": 1196, "epoch": 1 }, { "type": "loss", "content": 0.299358606338501, "timestamp": "2025-09-05 08:58:22.809849", "step": 1197, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:58:23.006195", "step": 1197, "epoch": 1 }, { "type": "loss", "content": 0.3517630994319916, "timestamp": "2025-09-05 08:58:23.008969", "step": 1198, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 08:58:23.205294", "step": 1198, "epoch": 1 }, { "type": "loss", "content": 0.3210662305355072, "timestamp": "2025-09-05 08:58:23.208167", "step": 1199, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:58:23.402800", "step": 1199, "epoch": 1 }, { "type": "loss", "content": 0.2987304925918579, "timestamp": "2025-09-05 08:58:23.420736", "step": 1200, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 08:58:28.151648", "step": 1200, "epoch": 1 }, { "type": "pplx", "content": 55.163801136680675, "timestamp": "2025-09-05 08:58:28.153281", "step": 1200, "epoch": 1 }, { "type": "info", "content": "Checkpoint saved at step 1200", "timestamp": "2025-09-05 08:58:28.623598", "step": 1200, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:58:28.796735", "step": 1200, "epoch": 1 }, { "type": "loss", "content": 0.37314802408218384, "timestamp": "2025-09-05 08:58:28.800241", "step": 1201, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:58:29.007449", "step": 1201, "epoch": 1 }, { "type": "loss", "content": 0.46742531657218933, "timestamp": "2025-09-05 08:58:29.009503", "step": 1202, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:58:29.221368", "step": 1202, "epoch": 1 }, { "type": "loss", "content": 0.3984720706939697, "timestamp": "2025-09-05 08:58:29.223451", "step": 1203, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:58:29.435123", "step": 1203, "epoch": 1 }, { "type": "loss", "content": 0.32529622316360474, "timestamp": "2025-09-05 08:58:29.449351", "step": 1204, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:58:29.648833", "step": 1204, "epoch": 1 }, { "type": "loss", "content": 0.5382220149040222, "timestamp": "2025-09-05 08:58:29.650921", "step": 1205, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:58:29.849552", "step": 1205, "epoch": 1 }, { "type": "loss", "content": 0.38562560081481934, "timestamp": "2025-09-05 08:58:29.851589", "step": 1206, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:58:30.060179", "step": 1206, "epoch": 1 }, { "type": "loss", "content": 0.2525237798690796, "timestamp": "2025-09-05 08:58:30.062452", "step": 1207, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:58:30.261557", "step": 1207, "epoch": 1 }, { "type": "loss", "content": 0.3691144287586212, "timestamp": "2025-09-05 08:58:30.279697", "step": 1208, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:58:30.480112", "step": 1208, "epoch": 1 }, { "type": "loss", "content": 0.28269970417022705, "timestamp": "2025-09-05 08:58:30.483548", "step": 1209, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 08:58:30.688759", "step": 1209, "epoch": 1 }, { "type": "loss", "content": 0.3319048583507538, "timestamp": "2025-09-05 08:58:30.691126", "step": 1210, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:58:30.888760", "step": 1210, "epoch": 1 }, { "type": "loss", "content": 0.3252932131290436, "timestamp": "2025-09-05 08:58:30.890442", "step": 1211, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 4480027263872.0 }, "timestamp": "2025-09-05 08:58:31.098575", "step": 1211, "epoch": 1 }, { "type": "loss", "content": 0.4231226444244385, "timestamp": "2025-09-05 08:58:31.115403", "step": 1212, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:58:31.305184", "step": 1212, "epoch": 1 }, { "type": "loss", "content": 0.28163203597068787, "timestamp": "2025-09-05 08:58:31.308680", "step": 1213, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 08:58:31.515231", "step": 1213, "epoch": 1 }, { "type": "loss", "content": 0.3782075345516205, "timestamp": "2025-09-05 08:58:31.517092", "step": 1214, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 08:58:31.714360", "step": 1214, "epoch": 1 }, { "type": "loss", "content": 0.30757156014442444, "timestamp": "2025-09-05 08:58:31.716274", "step": 1215, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:58:31.912180", "step": 1215, "epoch": 1 }, { "type": "loss", "content": 0.46240562200546265, "timestamp": "2025-09-05 08:58:31.926460", "step": 1216, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 08:58:32.116240", "step": 1216, "epoch": 1 }, { "type": "loss", "content": 0.275336891412735, "timestamp": "2025-09-05 08:58:32.118015", "step": 1217, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:58:32.314740", "step": 1217, "epoch": 1 }, { "type": "loss", "content": 0.43169447779655457, "timestamp": "2025-09-05 08:58:32.316516", "step": 1218, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:58:32.523346", "step": 1218, "epoch": 1 }, { "type": "loss", "content": 0.4832044541835785, "timestamp": "2025-09-05 08:58:32.525237", "step": 1219, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:58:32.726761", "step": 1219, "epoch": 1 }, { "type": "loss", "content": 0.3741658627986908, "timestamp": "2025-09-05 08:58:32.743245", "step": 1220, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 08:58:37.465340", "step": 1220, "epoch": 1 }, { "type": "pplx", "content": 55.636826007322504, "timestamp": "2025-09-05 08:58:37.467376", "step": 1220, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:58:37.629395", "step": 1220, "epoch": 1 }, { "type": "loss", "content": 0.2922039031982422, "timestamp": "2025-09-05 08:58:37.633868", "step": 1221, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:58:37.801769", "step": 1221, "epoch": 1 }, { "type": "loss", "content": 0.31432512402534485, "timestamp": "2025-09-05 08:58:37.806881", "step": 1222, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:58:38.020020", "step": 1222, "epoch": 1 }, { "type": "loss", "content": 0.273971289396286, "timestamp": "2025-09-05 08:58:38.021964", "step": 1223, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:58:38.217738", "step": 1223, "epoch": 1 }, { "type": "loss", "content": 0.2825872302055359, "timestamp": "2025-09-05 08:58:38.234390", "step": 1224, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:58:38.430446", "step": 1224, "epoch": 1 }, { "type": "loss", "content": 0.3784426152706146, "timestamp": "2025-09-05 08:58:38.432294", "step": 1225, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:58:38.638082", "step": 1225, "epoch": 1 }, { "type": "loss", "content": 0.36815857887268066, "timestamp": "2025-09-05 08:58:38.639863", "step": 1226, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 08:58:38.846971", "step": 1226, "epoch": 1 }, { "type": "loss", "content": 0.3271258771419525, "timestamp": "2025-09-05 08:58:38.848960", "step": 1227, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-05 08:58:39.045679", "step": 1227, "epoch": 1 }, { "type": "loss", "content": 0.2493131011724472, "timestamp": "2025-09-05 08:58:39.060206", "step": 1228, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:58:39.255473", "step": 1228, "epoch": 1 }, { "type": "loss", "content": 0.2988832890987396, "timestamp": "2025-09-05 08:58:39.257757", "step": 1229, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:58:39.472160", "step": 1229, "epoch": 1 }, { "type": "loss", "content": 0.44903451204299927, "timestamp": "2025-09-05 08:58:39.473964", "step": 1230, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:58:39.671824", "step": 1230, "epoch": 1 }, { "type": "loss", "content": 0.5419812798500061, "timestamp": "2025-09-05 08:58:39.674087", "step": 1231, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:58:39.882692", "step": 1231, "epoch": 1 }, { "type": "loss", "content": 0.3951758146286011, "timestamp": "2025-09-05 08:58:39.897357", "step": 1232, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:58:40.086411", "step": 1232, "epoch": 1 }, { "type": "loss", "content": 0.24042554199695587, "timestamp": "2025-09-05 08:58:40.088327", "step": 1233, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 08:58:40.294687", "step": 1233, "epoch": 1 }, { "type": "loss", "content": 0.4239327013492584, "timestamp": "2025-09-05 08:58:40.296994", "step": 1234, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:58:40.497105", "step": 1234, "epoch": 1 }, { "type": "loss", "content": 0.19099119305610657, "timestamp": "2025-09-05 08:58:40.499457", "step": 1235, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:58:40.706219", "step": 1235, "epoch": 1 }, { "type": "loss", "content": 0.30486881732940674, "timestamp": "2025-09-05 08:58:40.722373", "step": 1236, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 08:58:40.920116", "step": 1236, "epoch": 1 }, { "type": "loss", "content": 0.24796439707279205, "timestamp": "2025-09-05 08:58:40.921901", "step": 1237, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:58:41.118829", "step": 1237, "epoch": 1 }, { "type": "loss", "content": 0.2669127881526947, "timestamp": "2025-09-05 08:58:41.120812", "step": 1238, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:58:41.327218", "step": 1238, "epoch": 1 }, { "type": "loss", "content": 0.20141269266605377, "timestamp": "2025-09-05 08:58:41.329910", "step": 1239, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:58:41.530544", "step": 1239, "epoch": 1 }, { "type": "loss", "content": 0.4013859033584595, "timestamp": "2025-09-05 08:58:41.547398", "step": 1240, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 08:58:46.337252", "step": 1240, "epoch": 1 }, { "type": "pplx", "content": 55.81478220379367, "timestamp": "2025-09-05 08:58:46.339141", "step": 1240, "epoch": 1 }, { "type": "info", "content": "Checkpoint saved at step 1240", "timestamp": "2025-09-05 08:58:47.010044", "step": 1240, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:58:47.179545", "step": 1240, "epoch": 1 }, { "type": "loss", "content": 0.3496423661708832, "timestamp": "2025-09-05 08:58:47.181523", "step": 1241, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:58:47.391265", "step": 1241, "epoch": 1 }, { "type": "loss", "content": 0.17773263156414032, "timestamp": "2025-09-05 08:58:47.393443", "step": 1242, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:58:47.591587", "step": 1242, "epoch": 1 }, { "type": "loss", "content": 0.3922744393348694, "timestamp": "2025-09-05 08:58:47.593807", "step": 1243, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 08:58:47.792392", "step": 1243, "epoch": 1 }, { "type": "loss", "content": 0.39606019854545593, "timestamp": "2025-09-05 08:58:47.808993", "step": 1244, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:58:48.005991", "step": 1244, "epoch": 1 }, { "type": "loss", "content": 0.28507742285728455, "timestamp": "2025-09-05 08:58:48.007751", "step": 1245, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:58:48.208527", "step": 1245, "epoch": 1 }, { "type": "loss", "content": 0.38115379214286804, "timestamp": "2025-09-05 08:58:48.210544", "step": 1246, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 4480027263872.0 }, "timestamp": "2025-09-05 08:58:48.420393", "step": 1246, "epoch": 1 }, { "type": "loss", "content": 0.38982245326042175, "timestamp": "2025-09-05 08:58:48.423008", "step": 1247, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:58:48.621835", "step": 1247, "epoch": 1 }, { "type": "loss", "content": 0.3070641756057739, "timestamp": "2025-09-05 08:58:48.639605", "step": 1248, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:58:48.835488", "step": 1248, "epoch": 1 }, { "type": "loss", "content": 0.29290473461151123, "timestamp": "2025-09-05 08:58:48.837083", "step": 1249, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 08:58:49.044009", "step": 1249, "epoch": 1 }, { "type": "loss", "content": 0.33328184485435486, "timestamp": "2025-09-05 08:58:49.045921", "step": 1250, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 4480027263872.0 }, "timestamp": "2025-09-05 08:58:49.243517", "step": 1250, "epoch": 1 }, { "type": "loss", "content": 0.482103168964386, "timestamp": "2025-09-05 08:58:49.245731", "step": 1251, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:58:49.451807", "step": 1251, "epoch": 1 }, { "type": "loss", "content": 0.4451736807823181, "timestamp": "2025-09-05 08:58:49.465976", "step": 1252, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:58:49.664436", "step": 1252, "epoch": 1 }, { "type": "loss", "content": 0.2903027832508087, "timestamp": "2025-09-05 08:58:49.666700", "step": 1253, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:58:49.873073", "step": 1253, "epoch": 1 }, { "type": "loss", "content": 0.2942162752151489, "timestamp": "2025-09-05 08:58:49.875001", "step": 1254, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:58:50.072634", "step": 1254, "epoch": 1 }, { "type": "loss", "content": 0.4005998969078064, "timestamp": "2025-09-05 08:58:50.074375", "step": 1255, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:58:50.273482", "step": 1255, "epoch": 1 }, { "type": "loss", "content": 0.3666303753852844, "timestamp": "2025-09-05 08:58:50.290366", "step": 1256, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:58:50.486596", "step": 1256, "epoch": 1 }, { "type": "loss", "content": 0.44452735781669617, "timestamp": "2025-09-05 08:58:50.488547", "step": 1257, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 08:58:50.689502", "step": 1257, "epoch": 1 }, { "type": "loss", "content": 0.3432859480381012, "timestamp": "2025-09-05 08:58:50.691337", "step": 1258, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:58:50.895348", "step": 1258, "epoch": 1 }, { "type": "loss", "content": 0.44465234875679016, "timestamp": "2025-09-05 08:58:50.899575", "step": 1259, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:58:51.107777", "step": 1259, "epoch": 1 }, { "type": "loss", "content": 0.43338504433631897, "timestamp": "2025-09-05 08:58:51.124714", "step": 1260, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 08:58:55.873912", "step": 1260, "epoch": 1 }, { "type": "pplx", "content": 56.86786214445365, "timestamp": "2025-09-05 08:58:55.877053", "step": 1260, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:58:56.038358", "step": 1260, "epoch": 1 }, { "type": "loss", "content": 0.2701398730278015, "timestamp": "2025-09-05 08:58:56.040599", "step": 1261, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:58:56.246168", "step": 1261, "epoch": 1 }, { "type": "loss", "content": 0.319100558757782, "timestamp": "2025-09-05 08:58:56.248207", "step": 1262, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:58:56.445175", "step": 1262, "epoch": 1 }, { "type": "loss", "content": 0.3955155313014984, "timestamp": "2025-09-05 08:58:56.447043", "step": 1263, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:58:56.653005", "step": 1263, "epoch": 1 }, { "type": "loss", "content": 0.28663066029548645, "timestamp": "2025-09-05 08:58:56.672510", "step": 1264, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:58:56.882587", "step": 1264, "epoch": 1 }, { "type": "loss", "content": 0.2910723090171814, "timestamp": "2025-09-05 08:58:56.884373", "step": 1265, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 4480027263872.0 }, "timestamp": "2025-09-05 08:58:57.081041", "step": 1265, "epoch": 1 }, { "type": "loss", "content": 0.48940280079841614, "timestamp": "2025-09-05 08:58:57.083007", "step": 1266, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:58:57.278680", "step": 1266, "epoch": 1 }, { "type": "loss", "content": 0.25540128350257874, "timestamp": "2025-09-05 08:58:57.280828", "step": 1267, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:58:57.478468", "step": 1267, "epoch": 1 }, { "type": "loss", "content": 0.4109537899494171, "timestamp": "2025-09-05 08:58:57.492922", "step": 1268, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:58:57.682652", "step": 1268, "epoch": 1 }, { "type": "loss", "content": 0.3179936408996582, "timestamp": "2025-09-05 08:58:57.685638", "step": 1269, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:58:57.893860", "step": 1269, "epoch": 1 }, { "type": "loss", "content": 0.33500999212265015, "timestamp": "2025-09-05 08:58:57.896864", "step": 1270, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:58:58.105312", "step": 1270, "epoch": 1 }, { "type": "loss", "content": 0.316933810710907, "timestamp": "2025-09-05 08:58:58.107330", "step": 1271, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:58:58.303861", "step": 1271, "epoch": 1 }, { "type": "loss", "content": 0.3351965844631195, "timestamp": "2025-09-05 08:58:58.326530", "step": 1272, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:58:58.515213", "step": 1272, "epoch": 1 }, { "type": "loss", "content": 0.42428067326545715, "timestamp": "2025-09-05 08:58:58.517405", "step": 1273, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:58:58.713550", "step": 1273, "epoch": 1 }, { "type": "loss", "content": 0.24712416529655457, "timestamp": "2025-09-05 08:58:58.716283", "step": 1274, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:58:58.916484", "step": 1274, "epoch": 1 }, { "type": "loss", "content": 0.25089097023010254, "timestamp": "2025-09-05 08:58:58.921477", "step": 1275, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:58:59.132790", "step": 1275, "epoch": 1 }, { "type": "loss", "content": 0.3202970325946808, "timestamp": "2025-09-05 08:58:59.148507", "step": 1276, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 4800029206464.0 }, "timestamp": "2025-09-05 08:58:59.342869", "step": 1276, "epoch": 1 }, { "type": "loss", "content": 0.27855849266052246, "timestamp": "2025-09-05 08:58:59.344795", "step": 1277, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 4480027263872.0 }, "timestamp": "2025-09-05 08:58:59.543276", "step": 1277, "epoch": 1 }, { "type": "loss", "content": 0.40407755970954895, "timestamp": "2025-09-05 08:58:59.545205", "step": 1278, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:58:59.749926", "step": 1278, "epoch": 1 }, { "type": "loss", "content": 0.1615646630525589, "timestamp": "2025-09-05 08:58:59.759180", "step": 1279, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:58:59.964874", "step": 1279, "epoch": 1 }, { "type": "loss", "content": 0.3803251385688782, "timestamp": "2025-09-05 08:58:59.981869", "step": 1280, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 08:59:04.747412", "step": 1280, "epoch": 1 }, { "type": "pplx", "content": 57.685687212488666, "timestamp": "2025-09-05 08:59:04.749354", "step": 1280, "epoch": 1 }, { "type": "info", "content": "Checkpoint saved at step 1280", "timestamp": "2025-09-05 08:59:05.213901", "step": 1280, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:59:05.376627", "step": 1280, "epoch": 1 }, { "type": "loss", "content": 0.3631496727466583, "timestamp": "2025-09-05 08:59:05.378976", "step": 1281, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 08:59:05.579929", "step": 1281, "epoch": 1 }, { "type": "loss", "content": 0.3576178252696991, "timestamp": "2025-09-05 08:59:05.581880", "step": 1282, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:59:05.778238", "step": 1282, "epoch": 1 }, { "type": "loss", "content": 0.33735018968582153, "timestamp": "2025-09-05 08:59:05.781707", "step": 1283, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 08:59:05.980579", "step": 1283, "epoch": 1 }, { "type": "loss", "content": 0.4075542986392975, "timestamp": "2025-09-05 08:59:05.997002", "step": 1284, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:59:06.192702", "step": 1284, "epoch": 1 }, { "type": "loss", "content": 0.2630583643913269, "timestamp": "2025-09-05 08:59:06.194210", "step": 1285, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 4480027263872.0 }, "timestamp": "2025-09-05 08:59:06.400762", "step": 1285, "epoch": 1 }, { "type": "loss", "content": 0.46530115604400635, "timestamp": "2025-09-05 08:59:06.402542", "step": 1286, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:59:06.608888", "step": 1286, "epoch": 1 }, { "type": "loss", "content": 0.3795335292816162, "timestamp": "2025-09-05 08:59:06.610712", "step": 1287, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 4800029206464.0 }, "timestamp": "2025-09-05 08:59:06.828383", "step": 1287, "epoch": 1 }, { "type": "loss", "content": 0.43324583768844604, "timestamp": "2025-09-05 08:59:06.842845", "step": 1288, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:59:07.031687", "step": 1288, "epoch": 1 }, { "type": "loss", "content": 0.2171895056962967, "timestamp": "2025-09-05 08:59:07.033567", "step": 1289, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:59:07.229948", "step": 1289, "epoch": 1 }, { "type": "loss", "content": 0.25000834465026855, "timestamp": "2025-09-05 08:59:07.231777", "step": 1290, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:59:07.427444", "step": 1290, "epoch": 1 }, { "type": "loss", "content": 0.2618256211280823, "timestamp": "2025-09-05 08:59:07.429500", "step": 1291, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:59:07.635328", "step": 1291, "epoch": 1 }, { "type": "loss", "content": 0.395792156457901, "timestamp": "2025-09-05 08:59:07.652426", "step": 1292, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:59:07.849809", "step": 1292, "epoch": 1 }, { "type": "loss", "content": 0.3665156066417694, "timestamp": "2025-09-05 08:59:07.852069", "step": 1293, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:59:08.048771", "step": 1293, "epoch": 1 }, { "type": "loss", "content": 0.38869205117225647, "timestamp": "2025-09-05 08:59:08.050665", "step": 1294, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:59:08.217638", "step": 1294, "epoch": 1 }, { "type": "loss", "content": 0.27352771162986755, "timestamp": "2025-09-05 08:59:08.222770", "step": 1295, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 08:59:08.441962", "step": 1295, "epoch": 1 }, { "type": "loss", "content": 0.3475234806537628, "timestamp": "2025-09-05 08:59:08.456264", "step": 1296, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:59:08.643690", "step": 1296, "epoch": 1 }, { "type": "loss", "content": 0.4448797106742859, "timestamp": "2025-09-05 08:59:08.645368", "step": 1297, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:59:08.839667", "step": 1297, "epoch": 1 }, { "type": "loss", "content": 0.27712783217430115, "timestamp": "2025-09-05 08:59:08.841451", "step": 1298, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:59:09.046283", "step": 1298, "epoch": 1 }, { "type": "loss", "content": 0.2694713771343231, "timestamp": "2025-09-05 08:59:09.048220", "step": 1299, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:59:09.242540", "step": 1299, "epoch": 1 }, { "type": "loss", "content": 0.194560706615448, "timestamp": "2025-09-05 08:59:09.257384", "step": 1300, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 08:59:13.911796", "step": 1300, "epoch": 1 }, { "type": "pplx", "content": 57.60386166296047, "timestamp": "2025-09-05 08:59:13.915390", "step": 1300, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:59:14.080096", "step": 1300, "epoch": 2 }, { "type": "loss", "content": 0.3144618272781372, "timestamp": "2025-09-05 08:59:14.082040", "step": 1301, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 08:59:14.252081", "step": 1301, "epoch": 2 }, { "type": "loss", "content": 0.3946076035499573, "timestamp": "2025-09-05 08:59:14.254065", "step": 1302, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:59:14.458505", "step": 1302, "epoch": 2 }, { "type": "loss", "content": 0.2931336760520935, "timestamp": "2025-09-05 08:59:14.460293", "step": 1303, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:59:14.667262", "step": 1303, "epoch": 2 }, { "type": "loss", "content": 0.4380492568016052, "timestamp": "2025-09-05 08:59:14.684802", "step": 1304, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:59:14.883125", "step": 1304, "epoch": 2 }, { "type": "loss", "content": 0.26544374227523804, "timestamp": "2025-09-05 08:59:14.885058", "step": 1305, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:59:15.090269", "step": 1305, "epoch": 2 }, { "type": "loss", "content": 0.363553911447525, "timestamp": "2025-09-05 08:59:15.091915", "step": 1306, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:59:15.287843", "step": 1306, "epoch": 2 }, { "type": "loss", "content": 0.3278323709964752, "timestamp": "2025-09-05 08:59:15.289648", "step": 1307, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:59:15.495949", "step": 1307, "epoch": 2 }, { "type": "loss", "content": 0.49476808309555054, "timestamp": "2025-09-05 08:59:15.505766", "step": 1308, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:59:15.669479", "step": 1308, "epoch": 2 }, { "type": "loss", "content": 0.4298350214958191, "timestamp": "2025-09-05 08:59:15.671595", "step": 1309, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:59:15.876381", "step": 1309, "epoch": 2 }, { "type": "loss", "content": 0.41758593916893005, "timestamp": "2025-09-05 08:59:15.878324", "step": 1310, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:59:16.074306", "step": 1310, "epoch": 2 }, { "type": "loss", "content": 0.43739357590675354, "timestamp": "2025-09-05 08:59:16.076299", "step": 1311, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:59:16.282501", "step": 1311, "epoch": 2 }, { "type": "loss", "content": 0.44980770349502563, "timestamp": "2025-09-05 08:59:16.300777", "step": 1312, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:59:16.492159", "step": 1312, "epoch": 2 }, { "type": "loss", "content": 0.3103547692298889, "timestamp": "2025-09-05 08:59:16.494221", "step": 1313, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:59:16.701249", "step": 1313, "epoch": 2 }, { "type": "loss", "content": 0.42775505781173706, "timestamp": "2025-09-05 08:59:16.703236", "step": 1314, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:59:16.910343", "step": 1314, "epoch": 2 }, { "type": "loss", "content": 0.2993074059486389, "timestamp": "2025-09-05 08:59:16.913368", "step": 1315, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:59:17.108812", "step": 1315, "epoch": 2 }, { "type": "loss", "content": 0.3599098324775696, "timestamp": "2025-09-05 08:59:17.123094", "step": 1316, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:59:17.312042", "step": 1316, "epoch": 2 }, { "type": "loss", "content": 0.3977486491203308, "timestamp": "2025-09-05 08:59:17.313889", "step": 1317, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:59:17.510084", "step": 1317, "epoch": 2 }, { "type": "loss", "content": 0.3357473611831665, "timestamp": "2025-09-05 08:59:17.512160", "step": 1318, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 08:59:17.709325", "step": 1318, "epoch": 2 }, { "type": "loss", "content": 0.309467077255249, "timestamp": "2025-09-05 08:59:17.711246", "step": 1319, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:59:17.908970", "step": 1319, "epoch": 2 }, { "type": "loss", "content": 0.29499152302742004, "timestamp": "2025-09-05 08:59:17.923049", "step": 1320, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 08:59:22.624294", "step": 1320, "epoch": 2 }, { "type": "pplx", "content": 57.36183768004392, "timestamp": "2025-09-05 08:59:22.626617", "step": 1320, "epoch": 2 }, { "type": "info", "content": "Checkpoint saved at step 1320", "timestamp": "2025-09-05 08:59:23.087989", "step": 1320, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:59:23.269735", "step": 1320, "epoch": 2 }, { "type": "loss", "content": 0.44477683305740356, "timestamp": "2025-09-05 08:59:23.272494", "step": 1321, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 08:59:23.440651", "step": 1321, "epoch": 2 }, { "type": "loss", "content": 0.46572038531303406, "timestamp": "2025-09-05 08:59:23.443718", "step": 1322, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:59:23.650943", "step": 1322, "epoch": 2 }, { "type": "loss", "content": 0.27906832098960876, "timestamp": "2025-09-05 08:59:23.653252", "step": 1323, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 08:59:23.865736", "step": 1323, "epoch": 2 }, { "type": "loss", "content": 0.4312252104282379, "timestamp": "2025-09-05 08:59:23.880583", "step": 1324, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:59:24.071014", "step": 1324, "epoch": 2 }, { "type": "loss", "content": 0.30487996339797974, "timestamp": "2025-09-05 08:59:24.074126", "step": 1325, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:59:24.244295", "step": 1325, "epoch": 2 }, { "type": "loss", "content": 0.3292028307914734, "timestamp": "2025-09-05 08:59:24.247257", "step": 1326, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:59:24.452913", "step": 1326, "epoch": 2 }, { "type": "loss", "content": 0.3268643915653229, "timestamp": "2025-09-05 08:59:24.455149", "step": 1327, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:59:24.651837", "step": 1327, "epoch": 2 }, { "type": "loss", "content": 0.39009836316108704, "timestamp": "2025-09-05 08:59:24.666828", "step": 1328, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:59:24.856475", "step": 1328, "epoch": 2 }, { "type": "loss", "content": 0.33519843220710754, "timestamp": "2025-09-05 08:59:24.858850", "step": 1329, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:59:25.063589", "step": 1329, "epoch": 2 }, { "type": "loss", "content": 0.28642693161964417, "timestamp": "2025-09-05 08:59:25.065551", "step": 1330, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:59:25.278910", "step": 1330, "epoch": 2 }, { "type": "loss", "content": 0.44241082668304443, "timestamp": "2025-09-05 08:59:25.280946", "step": 1331, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:59:25.479903", "step": 1331, "epoch": 2 }, { "type": "loss", "content": 0.33670666813850403, "timestamp": "2025-09-05 08:59:25.494610", "step": 1332, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:59:25.683317", "step": 1332, "epoch": 2 }, { "type": "loss", "content": 0.2926885485649109, "timestamp": "2025-09-05 08:59:25.685308", "step": 1333, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:59:25.891586", "step": 1333, "epoch": 2 }, { "type": "loss", "content": 0.3921295404434204, "timestamp": "2025-09-05 08:59:25.894116", "step": 1334, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:59:26.100092", "step": 1334, "epoch": 2 }, { "type": "loss", "content": 0.37950482964515686, "timestamp": "2025-09-05 08:59:26.102003", "step": 1335, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:59:26.299905", "step": 1335, "epoch": 2 }, { "type": "loss", "content": 0.4223680794239044, "timestamp": "2025-09-05 08:59:26.314409", "step": 1336, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:59:26.503230", "step": 1336, "epoch": 2 }, { "type": "loss", "content": 0.4278903603553772, "timestamp": "2025-09-05 08:59:26.505314", "step": 1337, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:59:26.712369", "step": 1337, "epoch": 2 }, { "type": "loss", "content": 0.3812592923641205, "timestamp": "2025-09-05 08:59:26.714473", "step": 1338, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:59:26.921242", "step": 1338, "epoch": 2 }, { "type": "loss", "content": 0.3434469997882843, "timestamp": "2025-09-05 08:59:26.925060", "step": 1339, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:59:27.124683", "step": 1339, "epoch": 2 }, { "type": "loss", "content": 0.2850334346294403, "timestamp": "2025-09-05 08:59:27.139548", "step": 1340, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 08:59:31.890740", "step": 1340, "epoch": 2 }, { "type": "pplx", "content": 56.91573223377922, "timestamp": "2025-09-05 08:59:31.893487", "step": 1340, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:59:32.055369", "step": 1340, "epoch": 2 }, { "type": "loss", "content": 0.3462655246257782, "timestamp": "2025-09-05 08:59:32.057646", "step": 1341, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:59:32.262517", "step": 1341, "epoch": 2 }, { "type": "loss", "content": 0.346880704164505, "timestamp": "2025-09-05 08:59:32.264638", "step": 1342, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:59:32.463118", "step": 1342, "epoch": 2 }, { "type": "loss", "content": 0.3966115415096283, "timestamp": "2025-09-05 08:59:32.465165", "step": 1343, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:59:32.671225", "step": 1343, "epoch": 2 }, { "type": "loss", "content": 0.33604347705841064, "timestamp": "2025-09-05 08:59:32.688689", "step": 1344, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:59:32.885583", "step": 1344, "epoch": 2 }, { "type": "loss", "content": 0.40588828921318054, "timestamp": "2025-09-05 08:59:32.887860", "step": 1345, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:59:33.085720", "step": 1345, "epoch": 2 }, { "type": "loss", "content": 0.24824795126914978, "timestamp": "2025-09-05 08:59:33.089596", "step": 1346, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:59:33.306741", "step": 1346, "epoch": 2 }, { "type": "loss", "content": 0.37973007559776306, "timestamp": "2025-09-05 08:59:33.309131", "step": 1347, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:59:33.517134", "step": 1347, "epoch": 2 }, { "type": "loss", "content": 0.46875327825546265, "timestamp": "2025-09-05 08:59:33.534015", "step": 1348, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 08:59:33.737292", "step": 1348, "epoch": 2 }, { "type": "loss", "content": 0.315308541059494, "timestamp": "2025-09-05 08:59:33.739605", "step": 1349, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:59:33.939497", "step": 1349, "epoch": 2 }, { "type": "loss", "content": 0.40450769662857056, "timestamp": "2025-09-05 08:59:33.941817", "step": 1350, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:59:34.148841", "step": 1350, "epoch": 2 }, { "type": "loss", "content": 0.33537474274635315, "timestamp": "2025-09-05 08:59:34.151279", "step": 1351, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:59:34.353877", "step": 1351, "epoch": 2 }, { "type": "loss", "content": 0.4013102948665619, "timestamp": "2025-09-05 08:59:34.373704", "step": 1352, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:59:34.571837", "step": 1352, "epoch": 2 }, { "type": "loss", "content": 0.37566959857940674, "timestamp": "2025-09-05 08:59:34.574167", "step": 1353, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:59:34.772021", "step": 1353, "epoch": 2 }, { "type": "loss", "content": 0.2328030914068222, "timestamp": "2025-09-05 08:59:34.774460", "step": 1354, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:59:34.982169", "step": 1354, "epoch": 2 }, { "type": "loss", "content": 0.40998750925064087, "timestamp": "2025-09-05 08:59:34.984549", "step": 1355, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:59:35.198027", "step": 1355, "epoch": 2 }, { "type": "loss", "content": 0.3128526210784912, "timestamp": "2025-09-05 08:59:35.214422", "step": 1356, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:59:35.405238", "step": 1356, "epoch": 2 }, { "type": "loss", "content": 0.26632222533226013, "timestamp": "2025-09-05 08:59:35.407784", "step": 1357, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:59:35.615592", "step": 1357, "epoch": 2 }, { "type": "loss", "content": 0.4280310273170471, "timestamp": "2025-09-05 08:59:35.617878", "step": 1358, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 08:59:35.831407", "step": 1358, "epoch": 2 }, { "type": "loss", "content": 0.3956908881664276, "timestamp": "2025-09-05 08:59:35.833787", "step": 1359, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:59:36.041582", "step": 1359, "epoch": 2 }, { "type": "loss", "content": 0.3153320550918579, "timestamp": "2025-09-05 08:59:36.056665", "step": 1360, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 08:59:40.819925", "step": 1360, "epoch": 2 }, { "type": "pplx", "content": 56.70963319815915, "timestamp": "2025-09-05 08:59:40.823587", "step": 1360, "epoch": 2 }, { "type": "info", "content": "Checkpoint saved at step 1360", "timestamp": "2025-09-05 08:59:41.503796", "step": 1360, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:59:41.682641", "step": 1360, "epoch": 2 }, { "type": "loss", "content": 0.28595757484436035, "timestamp": "2025-09-05 08:59:41.685944", "step": 1361, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:59:41.882882", "step": 1361, "epoch": 2 }, { "type": "loss", "content": 0.24493901431560516, "timestamp": "2025-09-05 08:59:41.885326", "step": 1362, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 08:59:42.085395", "step": 1362, "epoch": 2 }, { "type": "loss", "content": 0.5457180142402649, "timestamp": "2025-09-05 08:59:42.088843", "step": 1363, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:59:42.297550", "step": 1363, "epoch": 2 }, { "type": "loss", "content": 0.4040033221244812, "timestamp": "2025-09-05 08:59:42.311680", "step": 1364, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:59:42.500508", "step": 1364, "epoch": 2 }, { "type": "loss", "content": 0.45766469836235046, "timestamp": "2025-09-05 08:59:42.504774", "step": 1365, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:59:42.715436", "step": 1365, "epoch": 2 }, { "type": "loss", "content": 0.3269166648387909, "timestamp": "2025-09-05 08:59:42.717414", "step": 1366, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:59:42.928316", "step": 1366, "epoch": 2 }, { "type": "loss", "content": 0.4435346722602844, "timestamp": "2025-09-05 08:59:42.930765", "step": 1367, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:59:43.139410", "step": 1367, "epoch": 2 }, { "type": "loss", "content": 0.30959320068359375, "timestamp": "2025-09-05 08:59:43.153945", "step": 1368, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:59:43.344550", "step": 1368, "epoch": 2 }, { "type": "loss", "content": 0.3181923031806946, "timestamp": "2025-09-05 08:59:43.348085", "step": 1369, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:59:43.550510", "step": 1369, "epoch": 2 }, { "type": "loss", "content": 0.3809313476085663, "timestamp": "2025-09-05 08:59:43.554965", "step": 1370, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 08:59:43.764296", "step": 1370, "epoch": 2 }, { "type": "loss", "content": 0.3070775270462036, "timestamp": "2025-09-05 08:59:43.765849", "step": 1371, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:59:43.970865", "step": 1371, "epoch": 2 }, { "type": "loss", "content": 0.3441022038459778, "timestamp": "2025-09-05 08:59:43.987786", "step": 1372, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 08:59:44.196406", "step": 1372, "epoch": 2 }, { "type": "loss", "content": 0.48999282717704773, "timestamp": "2025-09-05 08:59:44.199384", "step": 1373, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 4480027263872.0 }, "timestamp": "2025-09-05 08:59:44.400537", "step": 1373, "epoch": 2 }, { "type": "loss", "content": 0.3837975561618805, "timestamp": "2025-09-05 08:59:44.402764", "step": 1374, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:59:44.602234", "step": 1374, "epoch": 2 }, { "type": "loss", "content": 0.29959526658058167, "timestamp": "2025-09-05 08:59:44.606416", "step": 1375, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:59:44.820077", "step": 1375, "epoch": 2 }, { "type": "loss", "content": 0.3829365372657776, "timestamp": "2025-09-05 08:59:44.836407", "step": 1376, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:59:45.029202", "step": 1376, "epoch": 2 }, { "type": "loss", "content": 0.3905230164527893, "timestamp": "2025-09-05 08:59:45.031717", "step": 1377, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:59:45.228189", "step": 1377, "epoch": 2 }, { "type": "loss", "content": 0.2604524493217468, "timestamp": "2025-09-05 08:59:45.230920", "step": 1378, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:59:45.401870", "step": 1378, "epoch": 2 }, { "type": "loss", "content": 0.2648540437221527, "timestamp": "2025-09-05 08:59:45.403891", "step": 1379, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:59:45.610398", "step": 1379, "epoch": 2 }, { "type": "loss", "content": 0.25715941190719604, "timestamp": "2025-09-05 08:59:45.625180", "step": 1380, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 08:59:50.612485", "step": 1380, "epoch": 2 }, { "type": "pplx", "content": 56.62276512044233, "timestamp": "2025-09-05 08:59:50.616571", "step": 1380, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:59:50.791068", "step": 1380, "epoch": 2 }, { "type": "loss", "content": 0.39979779720306396, "timestamp": "2025-09-05 08:59:50.792966", "step": 1381, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:59:50.997964", "step": 1381, "epoch": 2 }, { "type": "loss", "content": 0.3293604254722595, "timestamp": "2025-09-05 08:59:51.001515", "step": 1382, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 08:59:51.214343", "step": 1382, "epoch": 2 }, { "type": "loss", "content": 0.4426141679286957, "timestamp": "2025-09-05 08:59:51.216786", "step": 1383, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:59:51.477127", "step": 1383, "epoch": 2 }, { "type": "loss", "content": 0.4094974994659424, "timestamp": "2025-09-05 08:59:51.492213", "step": 1384, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:59:51.692197", "step": 1384, "epoch": 2 }, { "type": "loss", "content": 0.3006475865840912, "timestamp": "2025-09-05 08:59:51.694616", "step": 1385, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:59:51.893460", "step": 1385, "epoch": 2 }, { "type": "loss", "content": 0.3527871370315552, "timestamp": "2025-09-05 08:59:51.895681", "step": 1386, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:59:52.108314", "step": 1386, "epoch": 2 }, { "type": "loss", "content": 0.4060435891151428, "timestamp": "2025-09-05 08:59:52.113186", "step": 1387, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 4480027263872.0 }, "timestamp": "2025-09-05 08:59:52.315169", "step": 1387, "epoch": 2 }, { "type": "loss", "content": 0.31812533736228943, "timestamp": "2025-09-05 08:59:52.329965", "step": 1388, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 08:59:52.523118", "step": 1388, "epoch": 2 }, { "type": "loss", "content": 0.4235638976097107, "timestamp": "2025-09-05 08:59:52.525559", "step": 1389, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:59:52.765252", "step": 1389, "epoch": 2 }, { "type": "loss", "content": 0.3670898377895355, "timestamp": "2025-09-05 08:59:52.768592", "step": 1390, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:59:52.967645", "step": 1390, "epoch": 2 }, { "type": "loss", "content": 0.3166564404964447, "timestamp": "2025-09-05 08:59:52.969921", "step": 1391, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:59:53.171302", "step": 1391, "epoch": 2 }, { "type": "loss", "content": 0.3685086965560913, "timestamp": "2025-09-05 08:59:53.189842", "step": 1392, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:59:53.392408", "step": 1392, "epoch": 2 }, { "type": "loss", "content": 0.24257375299930573, "timestamp": "2025-09-05 08:59:53.394129", "step": 1393, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:59:53.602965", "step": 1393, "epoch": 2 }, { "type": "loss", "content": 0.4437284767627716, "timestamp": "2025-09-05 08:59:53.605553", "step": 1394, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 4480027263872.0 }, "timestamp": "2025-09-05 08:59:53.885427", "step": 1394, "epoch": 2 }, { "type": "loss", "content": 0.44088515639305115, "timestamp": "2025-09-05 08:59:53.888345", "step": 1395, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 08:59:54.104505", "step": 1395, "epoch": 2 }, { "type": "loss", "content": 0.47625863552093506, "timestamp": "2025-09-05 08:59:54.123048", "step": 1396, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 08:59:54.327570", "step": 1396, "epoch": 2 }, { "type": "loss", "content": 0.46532103419303894, "timestamp": "2025-09-05 08:59:54.330291", "step": 1397, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:59:54.532148", "step": 1397, "epoch": 2 }, { "type": "loss", "content": 0.32851770520210266, "timestamp": "2025-09-05 08:59:54.534121", "step": 1398, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:59:54.827163", "step": 1398, "epoch": 2 }, { "type": "loss", "content": 0.5156394243240356, "timestamp": "2025-09-05 08:59:54.829390", "step": 1399, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 08:59:55.038664", "step": 1399, "epoch": 2 }, { "type": "loss", "content": 0.33277690410614014, "timestamp": "2025-09-05 08:59:55.054002", "step": 1400, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 09:00:00.092698", "step": 1400, "epoch": 2 }, { "type": "pplx", "content": 56.81417784437169, "timestamp": "2025-09-05 09:00:00.095367", "step": 1400, "epoch": 2 }, { "type": "info", "content": "Checkpoint saved at step 1400", "timestamp": "2025-09-05 09:00:00.596593", "step": 1400, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:00:00.791451", "step": 1400, "epoch": 2 }, { "type": "loss", "content": 0.25804686546325684, "timestamp": "2025-09-05 09:00:00.794003", "step": 1401, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-05 09:00:01.002420", "step": 1401, "epoch": 2 }, { "type": "loss", "content": 0.31888291239738464, "timestamp": "2025-09-05 09:00:01.005189", "step": 1402, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:00:01.208357", "step": 1402, "epoch": 2 }, { "type": "loss", "content": 0.36028096079826355, "timestamp": "2025-09-05 09:00:01.210255", "step": 1403, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:00:01.414257", "step": 1403, "epoch": 2 }, { "type": "loss", "content": 0.2646711766719818, "timestamp": "2025-09-05 09:00:01.428985", "step": 1404, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:00:01.625056", "step": 1404, "epoch": 2 }, { "type": "loss", "content": 0.3558668792247772, "timestamp": "2025-09-05 09:00:01.627352", "step": 1405, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:00:01.824547", "step": 1405, "epoch": 2 }, { "type": "loss", "content": 0.3586845099925995, "timestamp": "2025-09-05 09:00:01.827510", "step": 1406, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:00:02.029391", "step": 1406, "epoch": 2 }, { "type": "loss", "content": 0.20658056437969208, "timestamp": "2025-09-05 09:00:02.032797", "step": 1407, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:00:02.243681", "step": 1407, "epoch": 2 }, { "type": "loss", "content": 0.3639076352119446, "timestamp": "2025-09-05 09:00:02.260804", "step": 1408, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 4480027263872.0 }, "timestamp": "2025-09-05 09:00:02.462268", "step": 1408, "epoch": 2 }, { "type": "loss", "content": 0.26506686210632324, "timestamp": "2025-09-05 09:00:02.465508", "step": 1409, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:00:02.664380", "step": 1409, "epoch": 2 }, { "type": "loss", "content": 0.25214824080467224, "timestamp": "2025-09-05 09:00:02.667383", "step": 1410, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:00:02.876346", "step": 1410, "epoch": 2 }, { "type": "loss", "content": 0.31711870431900024, "timestamp": "2025-09-05 09:00:02.878119", "step": 1411, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:00:03.088023", "step": 1411, "epoch": 2 }, { "type": "loss", "content": 0.5284944772720337, "timestamp": "2025-09-05 09:00:03.102500", "step": 1412, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:00:03.300795", "step": 1412, "epoch": 2 }, { "type": "loss", "content": 0.3170612156391144, "timestamp": "2025-09-05 09:00:03.302474", "step": 1413, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:00:03.511578", "step": 1413, "epoch": 2 }, { "type": "loss", "content": 0.19468443095684052, "timestamp": "2025-09-05 09:00:03.513839", "step": 1414, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:00:03.767100", "step": 1414, "epoch": 2 }, { "type": "loss", "content": 0.26165586709976196, "timestamp": "2025-09-05 09:00:03.811286", "step": 1415, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:00:04.059572", "step": 1415, "epoch": 2 }, { "type": "loss", "content": 0.32013124227523804, "timestamp": "2025-09-05 09:00:04.069440", "step": 1416, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:00:04.236919", "step": 1416, "epoch": 2 }, { "type": "loss", "content": 0.36106082797050476, "timestamp": "2025-09-05 09:00:04.240225", "step": 1417, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:00:04.455188", "step": 1417, "epoch": 2 }, { "type": "loss", "content": 0.3524361252784729, "timestamp": "2025-09-05 09:00:04.458212", "step": 1418, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:00:04.725938", "step": 1418, "epoch": 2 }, { "type": "loss", "content": 0.25635817646980286, "timestamp": "2025-09-05 09:00:04.728692", "step": 1419, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:00:04.941135", "step": 1419, "epoch": 2 }, { "type": "loss", "content": 0.24682959914207458, "timestamp": "2025-09-05 09:00:04.957692", "step": 1420, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 09:00:10.139186", "step": 1420, "epoch": 2 }, { "type": "pplx", "content": 57.352158708318434, "timestamp": "2025-09-05 09:00:10.142827", "step": 1420, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:00:10.304786", "step": 1420, "epoch": 2 }, { "type": "loss", "content": 0.40001311898231506, "timestamp": "2025-09-05 09:00:10.308705", "step": 1421, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:00:10.507444", "step": 1421, "epoch": 2 }, { "type": "loss", "content": 0.30492401123046875, "timestamp": "2025-09-05 09:00:10.536910", "step": 1422, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:00:10.790334", "step": 1422, "epoch": 2 }, { "type": "loss", "content": 0.3323606252670288, "timestamp": "2025-09-05 09:00:10.792575", "step": 1423, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:00:11.006202", "step": 1423, "epoch": 2 }, { "type": "loss", "content": 0.23769192397594452, "timestamp": "2025-09-05 09:00:11.021212", "step": 1424, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:00:11.234231", "step": 1424, "epoch": 2 }, { "type": "loss", "content": 0.3142143189907074, "timestamp": "2025-09-05 09:00:11.237890", "step": 1425, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:00:11.531231", "step": 1425, "epoch": 2 }, { "type": "loss", "content": 0.3281928300857544, "timestamp": "2025-09-05 09:00:11.532868", "step": 1426, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:00:11.742869", "step": 1426, "epoch": 2 }, { "type": "loss", "content": 0.4385247528553009, "timestamp": "2025-09-05 09:00:11.744405", "step": 1427, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:00:11.943393", "step": 1427, "epoch": 2 }, { "type": "loss", "content": 0.3019809126853943, "timestamp": "2025-09-05 09:00:12.000833", "step": 1428, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:00:12.245974", "step": 1428, "epoch": 2 }, { "type": "loss", "content": 0.374027818441391, "timestamp": "2025-09-05 09:00:12.247971", "step": 1429, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:00:12.445020", "step": 1429, "epoch": 2 }, { "type": "loss", "content": 0.2926400601863861, "timestamp": "2025-09-05 09:00:12.446711", "step": 1430, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:00:12.616677", "step": 1430, "epoch": 2 }, { "type": "loss", "content": 0.2943369150161743, "timestamp": "2025-09-05 09:00:12.618732", "step": 1431, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:00:12.828550", "step": 1431, "epoch": 2 }, { "type": "loss", "content": 0.25276631116867065, "timestamp": "2025-09-05 09:00:12.838911", "step": 1432, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 4480027263872.0 }, "timestamp": "2025-09-05 09:00:13.004531", "step": 1432, "epoch": 2 }, { "type": "loss", "content": 0.47622016072273254, "timestamp": "2025-09-05 09:00:13.006782", "step": 1433, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:00:13.213452", "step": 1433, "epoch": 2 }, { "type": "loss", "content": 0.3429078161716461, "timestamp": "2025-09-05 09:00:13.215745", "step": 1434, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:00:13.423244", "step": 1434, "epoch": 2 }, { "type": "loss", "content": 0.2541375756263733, "timestamp": "2025-09-05 09:00:13.425629", "step": 1435, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:00:13.622643", "step": 1435, "epoch": 2 }, { "type": "loss", "content": 0.42097604274749756, "timestamp": "2025-09-05 09:00:13.638488", "step": 1436, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:00:13.834396", "step": 1436, "epoch": 2 }, { "type": "loss", "content": 0.29442155361175537, "timestamp": "2025-09-05 09:00:13.838219", "step": 1437, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:00:14.051866", "step": 1437, "epoch": 2 }, { "type": "loss", "content": 0.32426849007606506, "timestamp": "2025-09-05 09:00:14.054791", "step": 1438, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:00:14.267023", "step": 1438, "epoch": 2 }, { "type": "loss", "content": 0.3755996525287628, "timestamp": "2025-09-05 09:00:14.269523", "step": 1439, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:00:14.476738", "step": 1439, "epoch": 2 }, { "type": "loss", "content": 0.2896839678287506, "timestamp": "2025-09-05 09:00:14.492129", "step": 1440, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 09:00:19.555529", "step": 1440, "epoch": 2 }, { "type": "pplx", "content": 57.297274431293886, "timestamp": "2025-09-05 09:00:19.558449", "step": 1440, "epoch": 2 }, { "type": "info", "content": "Checkpoint saved at step 1440", "timestamp": "2025-09-05 09:00:20.192961", "step": 1440, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:00:20.379456", "step": 1440, "epoch": 2 }, { "type": "loss", "content": 0.3088245391845703, "timestamp": "2025-09-05 09:00:20.381264", "step": 1441, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:00:20.582334", "step": 1441, "epoch": 2 }, { "type": "loss", "content": 0.40866950154304504, "timestamp": "2025-09-05 09:00:20.584803", "step": 1442, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:00:20.783709", "step": 1442, "epoch": 2 }, { "type": "loss", "content": 0.40102720260620117, "timestamp": "2025-09-05 09:00:20.787143", "step": 1443, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 4480027263872.0 }, "timestamp": "2025-09-05 09:00:20.985647", "step": 1443, "epoch": 2 }, { "type": "loss", "content": 0.29152411222457886, "timestamp": "2025-09-05 09:00:21.001648", "step": 1444, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:00:21.248024", "step": 1444, "epoch": 2 }, { "type": "loss", "content": 0.35733139514923096, "timestamp": "2025-09-05 09:00:21.250634", "step": 1445, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:00:21.560840", "step": 1445, "epoch": 2 }, { "type": "loss", "content": 0.2699089050292969, "timestamp": "2025-09-05 09:00:21.563859", "step": 1446, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:00:21.762868", "step": 1446, "epoch": 2 }, { "type": "loss", "content": 0.39894458651542664, "timestamp": "2025-09-05 09:00:21.765328", "step": 1447, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:00:21.973633", "step": 1447, "epoch": 2 }, { "type": "loss", "content": 0.31270933151245117, "timestamp": "2025-09-05 09:00:21.988487", "step": 1448, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:00:22.181427", "step": 1448, "epoch": 2 }, { "type": "loss", "content": 0.468645304441452, "timestamp": "2025-09-05 09:00:22.183887", "step": 1449, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:00:22.382002", "step": 1449, "epoch": 2 }, { "type": "loss", "content": 0.2627173066139221, "timestamp": "2025-09-05 09:00:22.384074", "step": 1450, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:00:22.590432", "step": 1450, "epoch": 2 }, { "type": "loss", "content": 0.21120664477348328, "timestamp": "2025-09-05 09:00:22.593672", "step": 1451, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:00:22.845576", "step": 1451, "epoch": 2 }, { "type": "loss", "content": 0.37008100748062134, "timestamp": "2025-09-05 09:00:22.860746", "step": 1452, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:00:23.057719", "step": 1452, "epoch": 2 }, { "type": "loss", "content": 0.2987282872200012, "timestamp": "2025-09-05 09:00:23.060153", "step": 1453, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:00:23.328956", "step": 1453, "epoch": 2 }, { "type": "loss", "content": 0.4526394307613373, "timestamp": "2025-09-05 09:00:23.330758", "step": 1454, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:00:23.532425", "step": 1454, "epoch": 2 }, { "type": "loss", "content": 0.4313774108886719, "timestamp": "2025-09-05 09:00:23.535905", "step": 1455, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:00:23.738330", "step": 1455, "epoch": 2 }, { "type": "loss", "content": 0.3727917969226837, "timestamp": "2025-09-05 09:00:23.754096", "step": 1456, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:00:23.956198", "step": 1456, "epoch": 2 }, { "type": "loss", "content": 0.3371935486793518, "timestamp": "2025-09-05 09:00:23.958709", "step": 1457, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:00:24.156841", "step": 1457, "epoch": 2 }, { "type": "loss", "content": 0.29328054189682007, "timestamp": "2025-09-05 09:00:24.158646", "step": 1458, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:00:24.387752", "step": 1458, "epoch": 2 }, { "type": "loss", "content": 0.4609324336051941, "timestamp": "2025-09-05 09:00:24.389694", "step": 1459, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:00:24.586114", "step": 1459, "epoch": 2 }, { "type": "loss", "content": 0.3523360788822174, "timestamp": "2025-09-05 09:00:24.600107", "step": 1460, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 09:00:29.592360", "step": 1460, "epoch": 2 }, { "type": "pplx", "content": 57.27989146436504, "timestamp": "2025-09-05 09:00:29.594078", "step": 1460, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:00:29.757891", "step": 1460, "epoch": 2 }, { "type": "loss", "content": 0.272165983915329, "timestamp": "2025-09-05 09:00:29.760268", "step": 1461, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:00:29.965527", "step": 1461, "epoch": 2 }, { "type": "loss", "content": 0.4126870334148407, "timestamp": "2025-09-05 09:00:29.967387", "step": 1462, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 4480027263872.0 }, "timestamp": "2025-09-05 09:00:30.219610", "step": 1462, "epoch": 2 }, { "type": "loss", "content": 0.3365689218044281, "timestamp": "2025-09-05 09:00:30.221690", "step": 1463, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:00:30.419896", "step": 1463, "epoch": 2 }, { "type": "loss", "content": 0.3105847239494324, "timestamp": "2025-09-05 09:00:30.434605", "step": 1464, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:00:30.624678", "step": 1464, "epoch": 2 }, { "type": "loss", "content": 0.3041870594024658, "timestamp": "2025-09-05 09:00:30.669491", "step": 1465, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:00:30.908522", "step": 1465, "epoch": 2 }, { "type": "loss", "content": 0.21601124107837677, "timestamp": "2025-09-05 09:00:30.951473", "step": 1466, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:00:31.201263", "step": 1466, "epoch": 2 }, { "type": "loss", "content": 0.329571008682251, "timestamp": "2025-09-05 09:00:31.203271", "step": 1467, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:00:31.454663", "step": 1467, "epoch": 2 }, { "type": "loss", "content": 0.3269669711589813, "timestamp": "2025-09-05 09:00:31.469803", "step": 1468, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:00:31.664022", "step": 1468, "epoch": 2 }, { "type": "loss", "content": 0.45828115940093994, "timestamp": "2025-09-05 09:00:31.666165", "step": 1469, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:00:31.905612", "step": 1469, "epoch": 2 }, { "type": "loss", "content": 0.3074597716331482, "timestamp": "2025-09-05 09:00:31.907252", "step": 1470, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:00:32.113377", "step": 1470, "epoch": 2 }, { "type": "loss", "content": 0.45386582612991333, "timestamp": "2025-09-05 09:00:32.115039", "step": 1471, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:00:32.323220", "step": 1471, "epoch": 2 }, { "type": "loss", "content": 0.31208494305610657, "timestamp": "2025-09-05 09:00:32.340131", "step": 1472, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:00:32.539778", "step": 1472, "epoch": 2 }, { "type": "loss", "content": 0.30297181010246277, "timestamp": "2025-09-05 09:00:32.546254", "step": 1473, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:00:32.754227", "step": 1473, "epoch": 2 }, { "type": "loss", "content": 0.24466463923454285, "timestamp": "2025-09-05 09:00:32.755965", "step": 1474, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:00:33.051143", "step": 1474, "epoch": 2 }, { "type": "loss", "content": 0.22954264283180237, "timestamp": "2025-09-05 09:00:33.052843", "step": 1475, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:00:33.249729", "step": 1475, "epoch": 2 }, { "type": "loss", "content": 0.2577112317085266, "timestamp": "2025-09-05 09:00:33.266023", "step": 1476, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:00:33.454539", "step": 1476, "epoch": 2 }, { "type": "loss", "content": 0.4258563220500946, "timestamp": "2025-09-05 09:00:33.496351", "step": 1477, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:00:33.792317", "step": 1477, "epoch": 2 }, { "type": "loss", "content": 0.28466764092445374, "timestamp": "2025-09-05 09:00:33.794271", "step": 1478, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:00:34.046935", "step": 1478, "epoch": 2 }, { "type": "loss", "content": 0.3980422914028168, "timestamp": "2025-09-05 09:00:34.093014", "step": 1479, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:00:34.304162", "step": 1479, "epoch": 2 }, { "type": "loss", "content": 0.36734020709991455, "timestamp": "2025-09-05 09:00:34.321087", "step": 1480, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 09:00:39.490206", "step": 1480, "epoch": 2 }, { "type": "pplx", "content": 57.14511724528793, "timestamp": "2025-09-05 09:00:39.492052", "step": 1480, "epoch": 2 }, { "type": "info", "content": "Checkpoint saved at step 1480", "timestamp": "2025-09-05 09:00:39.944046", "step": 1480, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-05 09:00:40.106662", "step": 1480, "epoch": 2 }, { "type": "loss", "content": 0.2696632444858551, "timestamp": "2025-09-05 09:00:40.108366", "step": 1481, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:00:40.306117", "step": 1481, "epoch": 2 }, { "type": "loss", "content": 0.2808653712272644, "timestamp": "2025-09-05 09:00:40.308173", "step": 1482, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:00:40.519058", "step": 1482, "epoch": 2 }, { "type": "loss", "content": 0.43718430399894714, "timestamp": "2025-09-05 09:00:40.521238", "step": 1483, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:00:40.729677", "step": 1483, "epoch": 2 }, { "type": "loss", "content": 0.28544357419013977, "timestamp": "2025-09-05 09:00:40.746202", "step": 1484, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:00:40.945290", "step": 1484, "epoch": 2 }, { "type": "loss", "content": 0.18251703679561615, "timestamp": "2025-09-05 09:00:40.947259", "step": 1485, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:00:41.155284", "step": 1485, "epoch": 2 }, { "type": "loss", "content": 0.38339364528656006, "timestamp": "2025-09-05 09:00:41.197437", "step": 1486, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:00:41.450497", "step": 1486, "epoch": 2 }, { "type": "loss", "content": 0.34182044863700867, "timestamp": "2025-09-05 09:00:41.494088", "step": 1487, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:00:41.694560", "step": 1487, "epoch": 2 }, { "type": "loss", "content": 0.27841123938560486, "timestamp": "2025-09-05 09:00:41.711449", "step": 1488, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:00:41.917243", "step": 1488, "epoch": 2 }, { "type": "loss", "content": 0.342913955450058, "timestamp": "2025-09-05 09:00:41.920331", "step": 1489, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:00:42.129925", "step": 1489, "epoch": 2 }, { "type": "loss", "content": 0.31036046147346497, "timestamp": "2025-09-05 09:00:42.131756", "step": 1490, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:00:42.339428", "step": 1490, "epoch": 2 }, { "type": "loss", "content": 0.24073675274848938, "timestamp": "2025-09-05 09:00:42.415206", "step": 1491, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:00:42.628445", "step": 1491, "epoch": 2 }, { "type": "loss", "content": 0.43674877285957336, "timestamp": "2025-09-05 09:00:42.644955", "step": 1492, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:00:42.928380", "step": 1492, "epoch": 2 }, { "type": "loss", "content": 0.2842003107070923, "timestamp": "2025-09-05 09:00:42.930866", "step": 1493, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:00:43.130467", "step": 1493, "epoch": 2 }, { "type": "loss", "content": 0.2694554924964905, "timestamp": "2025-09-05 09:00:43.132760", "step": 1494, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:00:43.344581", "step": 1494, "epoch": 2 }, { "type": "loss", "content": 0.4056885540485382, "timestamp": "2025-09-05 09:00:43.387356", "step": 1495, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:00:43.636302", "step": 1495, "epoch": 2 }, { "type": "loss", "content": 0.4579038918018341, "timestamp": "2025-09-05 09:00:43.652646", "step": 1496, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:00:43.851183", "step": 1496, "epoch": 2 }, { "type": "loss", "content": 0.31665748357772827, "timestamp": "2025-09-05 09:00:43.853144", "step": 1497, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:00:44.062967", "step": 1497, "epoch": 2 }, { "type": "loss", "content": 0.28368595242500305, "timestamp": "2025-09-05 09:00:44.065000", "step": 1498, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:00:44.266543", "step": 1498, "epoch": 2 }, { "type": "loss", "content": 0.36625730991363525, "timestamp": "2025-09-05 09:00:44.268982", "step": 1499, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:00:44.467553", "step": 1499, "epoch": 2 }, { "type": "loss", "content": 0.35754650831222534, "timestamp": "2025-09-05 09:00:44.484265", "step": 1500, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 09:00:49.578608", "step": 1500, "epoch": 2 }, { "type": "pplx", "content": 57.19569553462994, "timestamp": "2025-09-05 09:00:49.580840", "step": 1500, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:00:49.743136", "step": 1500, "epoch": 2 }, { "type": "loss", "content": 0.2610255479812622, "timestamp": "2025-09-05 09:00:49.745210", "step": 1501, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:00:49.911404", "step": 1501, "epoch": 2 }, { "type": "loss", "content": 0.3230501115322113, "timestamp": "2025-09-05 09:00:49.913745", "step": 1502, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:00:50.122490", "step": 1502, "epoch": 2 }, { "type": "loss", "content": 0.42344826459884644, "timestamp": "2025-09-05 09:00:50.124894", "step": 1503, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:00:50.448049", "step": 1503, "epoch": 2 }, { "type": "loss", "content": 0.27271905541419983, "timestamp": "2025-09-05 09:00:50.504586", "step": 1504, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:00:50.761945", "step": 1504, "epoch": 2 }, { "type": "loss", "content": 0.38530251383781433, "timestamp": "2025-09-05 09:00:50.763482", "step": 1505, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:00:50.933199", "step": 1505, "epoch": 2 }, { "type": "loss", "content": 0.19776545464992523, "timestamp": "2025-09-05 09:00:50.934817", "step": 1506, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:00:51.142295", "step": 1506, "epoch": 2 }, { "type": "loss", "content": 0.4128006398677826, "timestamp": "2025-09-05 09:00:51.144305", "step": 1507, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:00:51.342140", "step": 1507, "epoch": 2 }, { "type": "loss", "content": 0.2871294915676117, "timestamp": "2025-09-05 09:00:51.358189", "step": 1508, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:00:51.547224", "step": 1508, "epoch": 2 }, { "type": "loss", "content": 0.24478839337825775, "timestamp": "2025-09-05 09:00:51.548878", "step": 1509, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:00:51.715852", "step": 1509, "epoch": 2 }, { "type": "loss", "content": 0.2897082269191742, "timestamp": "2025-09-05 09:00:51.717461", "step": 1510, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:00:51.887071", "step": 1510, "epoch": 2 }, { "type": "loss", "content": 0.2650999426841736, "timestamp": "2025-09-05 09:00:51.888963", "step": 1511, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:00:52.083755", "step": 1511, "epoch": 2 }, { "type": "loss", "content": 0.33520272374153137, "timestamp": "2025-09-05 09:00:52.100558", "step": 1512, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:00:52.297285", "step": 1512, "epoch": 2 }, { "type": "loss", "content": 0.2601361572742462, "timestamp": "2025-09-05 09:00:52.298983", "step": 1513, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:00:52.494605", "step": 1513, "epoch": 2 }, { "type": "loss", "content": 0.3293619453907013, "timestamp": "2025-09-05 09:00:52.496222", "step": 1514, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:00:52.703711", "step": 1514, "epoch": 2 }, { "type": "loss", "content": 0.3966652750968933, "timestamp": "2025-09-05 09:00:52.706032", "step": 1515, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:00:52.902753", "step": 1515, "epoch": 2 }, { "type": "loss", "content": 0.46486595273017883, "timestamp": "2025-09-05 09:00:52.912613", "step": 1516, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:00:53.075232", "step": 1516, "epoch": 2 }, { "type": "loss", "content": 0.2847346067428589, "timestamp": "2025-09-05 09:00:53.077884", "step": 1517, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:00:53.247343", "step": 1517, "epoch": 2 }, { "type": "loss", "content": 0.3045147657394409, "timestamp": "2025-09-05 09:00:53.264732", "step": 1518, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:00:53.472369", "step": 1518, "epoch": 2 }, { "type": "loss", "content": 0.34377965331077576, "timestamp": "2025-09-05 09:00:53.474258", "step": 1519, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:00:53.681046", "step": 1519, "epoch": 2 }, { "type": "loss", "content": 0.353103905916214, "timestamp": "2025-09-05 09:00:53.695217", "step": 1520, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 09:00:58.747097", "step": 1520, "epoch": 2 }, { "type": "pplx", "content": 57.199012932244166, "timestamp": "2025-09-05 09:00:58.749409", "step": 1520, "epoch": 2 }, { "type": "info", "content": "Checkpoint saved at step 1520", "timestamp": "2025-09-05 09:00:59.347636", "step": 1520, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:00:59.530819", "step": 1520, "epoch": 2 }, { "type": "loss", "content": 0.2339445799589157, "timestamp": "2025-09-05 09:00:59.532778", "step": 1521, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:00:59.789659", "step": 1521, "epoch": 2 }, { "type": "loss", "content": 0.41792815923690796, "timestamp": "2025-09-05 09:00:59.791267", "step": 1522, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:00:59.997972", "step": 1522, "epoch": 2 }, { "type": "loss", "content": 0.26164618134498596, "timestamp": "2025-09-05 09:01:00.001215", "step": 1523, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:01:00.201106", "step": 1523, "epoch": 2 }, { "type": "loss", "content": 0.3471045196056366, "timestamp": "2025-09-05 09:01:00.215901", "step": 1524, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:01:00.461448", "step": 1524, "epoch": 2 }, { "type": "loss", "content": 0.38675960898399353, "timestamp": "2025-09-05 09:01:00.464367", "step": 1525, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:01:00.672850", "step": 1525, "epoch": 2 }, { "type": "loss", "content": 0.36886975169181824, "timestamp": "2025-09-05 09:01:00.674827", "step": 1526, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:01:00.875762", "step": 1526, "epoch": 2 }, { "type": "loss", "content": 0.2916344106197357, "timestamp": "2025-09-05 09:01:00.877994", "step": 1527, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:01:01.186868", "step": 1527, "epoch": 2 }, { "type": "loss", "content": 0.2596033215522766, "timestamp": "2025-09-05 09:01:01.245041", "step": 1528, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:01:01.553114", "step": 1528, "epoch": 2 }, { "type": "loss", "content": 0.41993069648742676, "timestamp": "2025-09-05 09:01:01.554722", "step": 1529, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:01:01.762589", "step": 1529, "epoch": 2 }, { "type": "loss", "content": 0.3928647041320801, "timestamp": "2025-09-05 09:01:01.764333", "step": 1530, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:01:01.963776", "step": 1530, "epoch": 2 }, { "type": "loss", "content": 0.24546408653259277, "timestamp": "2025-09-05 09:01:01.966150", "step": 1531, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:01:02.164161", "step": 1531, "epoch": 2 }, { "type": "loss", "content": 0.3285897970199585, "timestamp": "2025-09-05 09:01:02.180529", "step": 1532, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:01:02.380667", "step": 1532, "epoch": 2 }, { "type": "loss", "content": 0.23597146570682526, "timestamp": "2025-09-05 09:01:02.382749", "step": 1533, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:01:02.592689", "step": 1533, "epoch": 2 }, { "type": "loss", "content": 0.2703210413455963, "timestamp": "2025-09-05 09:01:02.594777", "step": 1534, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:01:02.845322", "step": 1534, "epoch": 2 }, { "type": "loss", "content": 0.3228624165058136, "timestamp": "2025-09-05 09:01:02.847462", "step": 1535, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:01:03.047096", "step": 1535, "epoch": 2 }, { "type": "loss", "content": 0.3633241355419159, "timestamp": "2025-09-05 09:01:03.061335", "step": 1536, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:01:03.258802", "step": 1536, "epoch": 2 }, { "type": "loss", "content": 0.3116025924682617, "timestamp": "2025-09-05 09:01:03.261151", "step": 1537, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:01:03.505735", "step": 1537, "epoch": 2 }, { "type": "loss", "content": 0.2656325101852417, "timestamp": "2025-09-05 09:01:03.507750", "step": 1538, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:01:03.714858", "step": 1538, "epoch": 2 }, { "type": "loss", "content": 0.502154529094696, "timestamp": "2025-09-05 09:01:03.757830", "step": 1539, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:01:04.001581", "step": 1539, "epoch": 2 }, { "type": "loss", "content": 0.25501522421836853, "timestamp": "2025-09-05 09:01:04.056208", "step": 1540, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 09:01:09.059460", "step": 1540, "epoch": 2 }, { "type": "pplx", "content": 56.22339176817989, "timestamp": "2025-09-05 09:01:09.061549", "step": 1540, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:01:09.222637", "step": 1540, "epoch": 2 }, { "type": "loss", "content": 0.33964210748672485, "timestamp": "2025-09-05 09:01:09.224561", "step": 1541, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:01:09.390371", "step": 1541, "epoch": 2 }, { "type": "loss", "content": 0.3216552436351776, "timestamp": "2025-09-05 09:01:09.392053", "step": 1542, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:01:09.609437", "step": 1542, "epoch": 2 }, { "type": "loss", "content": 0.46677759289741516, "timestamp": "2025-09-05 09:01:09.611720", "step": 1543, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:01:09.820890", "step": 1543, "epoch": 2 }, { "type": "loss", "content": 0.4672287404537201, "timestamp": "2025-09-05 09:01:09.835069", "step": 1544, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:01:10.024285", "step": 1544, "epoch": 2 }, { "type": "loss", "content": 0.31345078349113464, "timestamp": "2025-09-05 09:01:10.026199", "step": 1545, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:01:10.235065", "step": 1545, "epoch": 2 }, { "type": "loss", "content": 0.25660619139671326, "timestamp": "2025-09-05 09:01:10.237026", "step": 1546, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:01:10.441232", "step": 1546, "epoch": 2 }, { "type": "loss", "content": 0.2932271957397461, "timestamp": "2025-09-05 09:01:10.443096", "step": 1547, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:01:10.609307", "step": 1547, "epoch": 2 }, { "type": "loss", "content": 0.28923869132995605, "timestamp": "2025-09-05 09:01:10.626261", "step": 1548, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:01:10.873301", "step": 1548, "epoch": 2 }, { "type": "loss", "content": 0.2676112949848175, "timestamp": "2025-09-05 09:01:10.875539", "step": 1549, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:01:11.121598", "step": 1549, "epoch": 2 }, { "type": "loss", "content": 0.24210013449192047, "timestamp": "2025-09-05 09:01:11.123753", "step": 1550, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:01:11.320871", "step": 1550, "epoch": 2 }, { "type": "loss", "content": 0.31983768939971924, "timestamp": "2025-09-05 09:01:11.322439", "step": 1551, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:01:11.529052", "step": 1551, "epoch": 2 }, { "type": "loss", "content": 0.2809242904186249, "timestamp": "2025-09-05 09:01:11.543506", "step": 1552, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:01:11.788938", "step": 1552, "epoch": 2 }, { "type": "loss", "content": 0.3017968237400055, "timestamp": "2025-09-05 09:01:11.791257", "step": 1553, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:01:12.041437", "step": 1553, "epoch": 2 }, { "type": "loss", "content": 0.32433098554611206, "timestamp": "2025-09-05 09:01:12.043576", "step": 1554, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:01:12.248072", "step": 1554, "epoch": 2 }, { "type": "loss", "content": 0.5199801921844482, "timestamp": "2025-09-05 09:01:12.249996", "step": 1555, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 4480027263872.0 }, "timestamp": "2025-09-05 09:01:12.416761", "step": 1555, "epoch": 2 }, { "type": "loss", "content": 0.303520530462265, "timestamp": "2025-09-05 09:01:12.476023", "step": 1556, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:01:12.719674", "step": 1556, "epoch": 2 }, { "type": "loss", "content": 0.4727158844470978, "timestamp": "2025-09-05 09:01:12.721369", "step": 1557, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:01:12.920588", "step": 1557, "epoch": 2 }, { "type": "loss", "content": 0.3274668753147125, "timestamp": "2025-09-05 09:01:12.922360", "step": 1558, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:01:13.120064", "step": 1558, "epoch": 2 }, { "type": "loss", "content": 0.35360151529312134, "timestamp": "2025-09-05 09:01:13.121761", "step": 1559, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:01:13.304478", "step": 1559, "epoch": 2 }, { "type": "loss", "content": 0.4361201524734497, "timestamp": "2025-09-05 09:01:13.313794", "step": 1560, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 09:01:18.338585", "step": 1560, "epoch": 2 }, { "type": "pplx", "content": 56.41456573621123, "timestamp": "2025-09-05 09:01:18.350072", "step": 1560, "epoch": 2 }, { "type": "info", "content": "Checkpoint saved at step 1560", "timestamp": "2025-09-05 09:01:18.864357", "step": 1560, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:01:19.052137", "step": 1560, "epoch": 2 }, { "type": "loss", "content": 0.3437816798686981, "timestamp": "2025-09-05 09:01:19.086911", "step": 1561, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:01:19.346975", "step": 1561, "epoch": 2 }, { "type": "loss", "content": 0.3193230926990509, "timestamp": "2025-09-05 09:01:19.348708", "step": 1562, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:01:19.554605", "step": 1562, "epoch": 2 }, { "type": "loss", "content": 0.3251970708370209, "timestamp": "2025-09-05 09:01:19.560596", "step": 1563, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:01:19.760161", "step": 1563, "epoch": 2 }, { "type": "loss", "content": 0.3838941156864166, "timestamp": "2025-09-05 09:01:19.776545", "step": 1564, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:01:19.974450", "step": 1564, "epoch": 2 }, { "type": "loss", "content": 0.3207242488861084, "timestamp": "2025-09-05 09:01:19.976164", "step": 1565, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:01:20.242298", "step": 1565, "epoch": 2 }, { "type": "loss", "content": 0.3163876235485077, "timestamp": "2025-09-05 09:01:20.244368", "step": 1566, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:01:20.440224", "step": 1566, "epoch": 2 }, { "type": "loss", "content": 0.2813356816768646, "timestamp": "2025-09-05 09:01:20.442252", "step": 1567, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:01:20.647109", "step": 1567, "epoch": 2 }, { "type": "loss", "content": 0.19117353856563568, "timestamp": "2025-09-05 09:01:20.661404", "step": 1568, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:01:20.888763", "step": 1568, "epoch": 2 }, { "type": "loss", "content": 0.34964126348495483, "timestamp": "2025-09-05 09:01:20.890577", "step": 1569, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:01:21.138262", "step": 1569, "epoch": 2 }, { "type": "loss", "content": 0.3004063367843628, "timestamp": "2025-09-05 09:01:21.141055", "step": 1570, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:01:21.335800", "step": 1570, "epoch": 2 }, { "type": "loss", "content": 0.3171744644641876, "timestamp": "2025-09-05 09:01:21.339587", "step": 1571, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:01:21.533662", "step": 1571, "epoch": 2 }, { "type": "loss", "content": 0.47390928864479065, "timestamp": "2025-09-05 09:01:21.551024", "step": 1572, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:01:21.746161", "step": 1572, "epoch": 2 }, { "type": "loss", "content": 0.34400877356529236, "timestamp": "2025-09-05 09:01:21.749209", "step": 1573, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:01:21.914493", "step": 1573, "epoch": 2 }, { "type": "loss", "content": 0.3936767876148224, "timestamp": "2025-09-05 09:01:21.917943", "step": 1574, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:01:22.122702", "step": 1574, "epoch": 2 }, { "type": "loss", "content": 0.2832147181034088, "timestamp": "2025-09-05 09:01:22.125039", "step": 1575, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:01:22.291669", "step": 1575, "epoch": 2 }, { "type": "loss", "content": 0.2926376760005951, "timestamp": "2025-09-05 09:01:22.309042", "step": 1576, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:01:22.551858", "step": 1576, "epoch": 2 }, { "type": "loss", "content": 0.4421359896659851, "timestamp": "2025-09-05 09:01:22.555003", "step": 1577, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:01:22.749743", "step": 1577, "epoch": 2 }, { "type": "loss", "content": 0.3380582928657532, "timestamp": "2025-09-05 09:01:22.752462", "step": 1578, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:01:22.949713", "step": 1578, "epoch": 2 }, { "type": "loss", "content": 0.23209118843078613, "timestamp": "2025-09-05 09:01:22.952426", "step": 1579, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:01:23.244611", "step": 1579, "epoch": 2 }, { "type": "loss", "content": 0.29452741146087646, "timestamp": "2025-09-05 09:01:23.260250", "step": 1580, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 09:01:28.387804", "step": 1580, "epoch": 2 }, { "type": "pplx", "content": 56.31358010394533, "timestamp": "2025-09-05 09:01:28.390146", "step": 1580, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:01:28.552853", "step": 1580, "epoch": 2 }, { "type": "loss", "content": 0.22305051982402802, "timestamp": "2025-09-05 09:01:28.555962", "step": 1581, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:01:28.762299", "step": 1581, "epoch": 2 }, { "type": "loss", "content": 0.4951125383377075, "timestamp": "2025-09-05 09:01:28.765755", "step": 1582, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:01:28.965837", "step": 1582, "epoch": 2 }, { "type": "loss", "content": 0.2661086320877075, "timestamp": "2025-09-05 09:01:28.967787", "step": 1583, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:01:29.150701", "step": 1583, "epoch": 2 }, { "type": "loss", "content": 0.3857158124446869, "timestamp": "2025-09-05 09:01:29.160993", "step": 1584, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:01:29.324956", "step": 1584, "epoch": 2 }, { "type": "loss", "content": 0.32872211933135986, "timestamp": "2025-09-05 09:01:29.326514", "step": 1585, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 4480027263872.0 }, "timestamp": "2025-09-05 09:01:29.531376", "step": 1585, "epoch": 2 }, { "type": "loss", "content": 0.3093208074569702, "timestamp": "2025-09-05 09:01:29.533473", "step": 1586, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:01:29.733192", "step": 1586, "epoch": 2 }, { "type": "loss", "content": 0.29720935225486755, "timestamp": "2025-09-05 09:01:29.735435", "step": 1587, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:01:29.930239", "step": 1587, "epoch": 2 }, { "type": "loss", "content": 0.3052479922771454, "timestamp": "2025-09-05 09:01:29.946354", "step": 1588, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:01:30.138120", "step": 1588, "epoch": 2 }, { "type": "loss", "content": 0.3246830105781555, "timestamp": "2025-09-05 09:01:30.140093", "step": 1589, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:01:30.336315", "step": 1589, "epoch": 2 }, { "type": "loss", "content": 0.4002441465854645, "timestamp": "2025-09-05 09:01:30.338455", "step": 1590, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:01:30.534261", "step": 1590, "epoch": 2 }, { "type": "loss", "content": 0.34371864795684814, "timestamp": "2025-09-05 09:01:30.577760", "step": 1591, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:01:30.775757", "step": 1591, "epoch": 2 }, { "type": "loss", "content": 0.30698439478874207, "timestamp": "2025-09-05 09:01:30.790514", "step": 1592, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 4480027263872.0 }, "timestamp": "2025-09-05 09:01:30.980985", "step": 1592, "epoch": 2 }, { "type": "loss", "content": 0.4689514636993408, "timestamp": "2025-09-05 09:01:30.982847", "step": 1593, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:01:31.187822", "step": 1593, "epoch": 2 }, { "type": "loss", "content": 0.3119965195655823, "timestamp": "2025-09-05 09:01:31.189428", "step": 1594, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:01:31.384273", "step": 1594, "epoch": 2 }, { "type": "loss", "content": 0.4260518550872803, "timestamp": "2025-09-05 09:01:31.386455", "step": 1595, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:01:31.584055", "step": 1595, "epoch": 2 }, { "type": "loss", "content": 0.42162322998046875, "timestamp": "2025-09-05 09:01:31.598523", "step": 1596, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:01:31.842004", "step": 1596, "epoch": 2 }, { "type": "loss", "content": 0.3950115144252777, "timestamp": "2025-09-05 09:01:31.844535", "step": 1597, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:01:32.042926", "step": 1597, "epoch": 2 }, { "type": "loss", "content": 0.3251371681690216, "timestamp": "2025-09-05 09:01:32.045798", "step": 1598, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:01:32.242051", "step": 1598, "epoch": 2 }, { "type": "loss", "content": 0.4372723698616028, "timestamp": "2025-09-05 09:01:32.245518", "step": 1599, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:01:32.442582", "step": 1599, "epoch": 2 }, { "type": "loss", "content": 0.2939607799053192, "timestamp": "2025-09-05 09:01:32.459198", "step": 1600, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 09:01:37.613280", "step": 1600, "epoch": 2 }, { "type": "pplx", "content": 56.03260913326416, "timestamp": "2025-09-05 09:01:37.615081", "step": 1600, "epoch": 2 }, { "type": "info", "content": "Checkpoint saved at step 1600", "timestamp": "2025-09-05 09:01:38.059228", "step": 1600, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-05 09:01:38.220521", "step": 1600, "epoch": 2 }, { "type": "loss", "content": 0.4079391062259674, "timestamp": "2025-09-05 09:01:38.222270", "step": 1601, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:01:38.426775", "step": 1601, "epoch": 2 }, { "type": "loss", "content": 0.26604366302490234, "timestamp": "2025-09-05 09:01:38.428405", "step": 1602, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:01:38.632567", "step": 1602, "epoch": 2 }, { "type": "loss", "content": 0.270683616399765, "timestamp": "2025-09-05 09:01:38.634094", "step": 1603, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:01:38.832221", "step": 1603, "epoch": 2 }, { "type": "loss", "content": 0.3520897924900055, "timestamp": "2025-09-05 09:01:38.846486", "step": 1604, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:01:39.035039", "step": 1604, "epoch": 2 }, { "type": "loss", "content": 0.4065357744693756, "timestamp": "2025-09-05 09:01:39.037089", "step": 1605, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-05 09:01:39.231198", "step": 1605, "epoch": 2 }, { "type": "loss", "content": 0.4396074414253235, "timestamp": "2025-09-05 09:01:39.233010", "step": 1606, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:01:39.432802", "step": 1606, "epoch": 2 }, { "type": "loss", "content": 0.19596950709819794, "timestamp": "2025-09-05 09:01:39.434547", "step": 1607, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:01:39.683964", "step": 1607, "epoch": 2 }, { "type": "loss", "content": 0.34500351548194885, "timestamp": "2025-09-05 09:01:39.698962", "step": 1608, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:01:39.890897", "step": 1608, "epoch": 2 }, { "type": "loss", "content": 0.29165971279144287, "timestamp": "2025-09-05 09:01:39.892845", "step": 1609, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:01:40.091985", "step": 1609, "epoch": 2 }, { "type": "loss", "content": 0.41614753007888794, "timestamp": "2025-09-05 09:01:40.094774", "step": 1610, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:01:40.345641", "step": 1610, "epoch": 2 }, { "type": "loss", "content": 0.35462021827697754, "timestamp": "2025-09-05 09:01:40.349976", "step": 1611, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 4480027263872.0 }, "timestamp": "2025-09-05 09:01:40.611172", "step": 1611, "epoch": 2 }, { "type": "loss", "content": 0.440545916557312, "timestamp": "2025-09-05 09:01:40.626023", "step": 1612, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:01:40.824860", "step": 1612, "epoch": 2 }, { "type": "loss", "content": 0.30644723773002625, "timestamp": "2025-09-05 09:01:40.827413", "step": 1613, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:01:41.078955", "step": 1613, "epoch": 2 }, { "type": "loss", "content": 0.31383422017097473, "timestamp": "2025-09-05 09:01:41.081534", "step": 1614, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:01:41.278142", "step": 1614, "epoch": 2 }, { "type": "loss", "content": 0.4448195695877075, "timestamp": "2025-09-05 09:01:41.280472", "step": 1615, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:01:41.473811", "step": 1615, "epoch": 2 }, { "type": "loss", "content": 0.2656569480895996, "timestamp": "2025-09-05 09:01:41.488401", "step": 1616, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:01:41.678286", "step": 1616, "epoch": 2 }, { "type": "loss", "content": 0.27651652693748474, "timestamp": "2025-09-05 09:01:41.680091", "step": 1617, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:01:41.929276", "step": 1617, "epoch": 2 }, { "type": "loss", "content": 0.22921140491962433, "timestamp": "2025-09-05 09:01:41.931200", "step": 1618, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:01:42.128588", "step": 1618, "epoch": 2 }, { "type": "loss", "content": 0.24880188703536987, "timestamp": "2025-09-05 09:01:42.130379", "step": 1619, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:01:42.339134", "step": 1619, "epoch": 2 }, { "type": "loss", "content": 0.27319225668907166, "timestamp": "2025-09-05 09:01:42.395861", "step": 1620, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 09:01:47.860884", "step": 1620, "epoch": 2 }, { "type": "pplx", "content": 55.03103150706018, "timestamp": "2025-09-05 09:01:47.862612", "step": 1620, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:01:48.026237", "step": 1620, "epoch": 2 }, { "type": "loss", "content": 0.2473212331533432, "timestamp": "2025-09-05 09:01:48.069969", "step": 1621, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:01:48.239318", "step": 1621, "epoch": 2 }, { "type": "loss", "content": 0.30699431896209717, "timestamp": "2025-09-05 09:01:48.241670", "step": 1622, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:01:48.447505", "step": 1622, "epoch": 2 }, { "type": "loss", "content": 0.47052350640296936, "timestamp": "2025-09-05 09:01:48.449557", "step": 1623, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:01:48.656637", "step": 1623, "epoch": 2 }, { "type": "loss", "content": 0.3628225028514862, "timestamp": "2025-09-05 09:01:48.714869", "step": 1624, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:01:48.914680", "step": 1624, "epoch": 2 }, { "type": "loss", "content": 0.26334837079048157, "timestamp": "2025-09-05 09:01:48.916727", "step": 1625, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:01:49.115550", "step": 1625, "epoch": 2 }, { "type": "loss", "content": 0.37516269087791443, "timestamp": "2025-09-05 09:01:49.117110", "step": 1626, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 4800029206464.0 }, "timestamp": "2025-09-05 09:01:49.315505", "step": 1626, "epoch": 2 }, { "type": "loss", "content": 0.5237755179405212, "timestamp": "2025-09-05 09:01:49.317923", "step": 1627, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:01:49.526667", "step": 1627, "epoch": 2 }, { "type": "loss", "content": 0.20416662096977234, "timestamp": "2025-09-05 09:01:49.540424", "step": 1628, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:01:49.731734", "step": 1628, "epoch": 2 }, { "type": "loss", "content": 0.3799273371696472, "timestamp": "2025-09-05 09:01:49.733539", "step": 1629, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:01:49.928401", "step": 1629, "epoch": 2 }, { "type": "loss", "content": 0.4185466170310974, "timestamp": "2025-09-05 09:01:49.930768", "step": 1630, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:01:50.137540", "step": 1630, "epoch": 2 }, { "type": "loss", "content": 0.28220972418785095, "timestamp": "2025-09-05 09:01:50.140239", "step": 1631, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:01:50.348233", "step": 1631, "epoch": 2 }, { "type": "loss", "content": 0.2010842263698578, "timestamp": "2025-09-05 09:01:50.363989", "step": 1632, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:01:50.607652", "step": 1632, "epoch": 2 }, { "type": "loss", "content": 0.29958879947662354, "timestamp": "2025-09-05 09:01:50.610206", "step": 1633, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:01:50.808964", "step": 1633, "epoch": 2 }, { "type": "loss", "content": 0.33167004585266113, "timestamp": "2025-09-05 09:01:50.811115", "step": 1634, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:01:51.009267", "step": 1634, "epoch": 2 }, { "type": "loss", "content": 0.24998310208320618, "timestamp": "2025-09-05 09:01:51.031265", "step": 1635, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:01:51.282600", "step": 1635, "epoch": 2 }, { "type": "loss", "content": 0.311273455619812, "timestamp": "2025-09-05 09:01:51.297279", "step": 1636, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:01:51.488464", "step": 1636, "epoch": 2 }, { "type": "loss", "content": 0.28948909044265747, "timestamp": "2025-09-05 09:01:51.490652", "step": 1637, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-05 09:01:51.746233", "step": 1637, "epoch": 2 }, { "type": "loss", "content": 0.2414962649345398, "timestamp": "2025-09-05 09:01:51.748103", "step": 1638, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:01:51.945697", "step": 1638, "epoch": 2 }, { "type": "loss", "content": 0.26412102580070496, "timestamp": "2025-09-05 09:01:51.947451", "step": 1639, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:01:52.142544", "step": 1639, "epoch": 2 }, { "type": "loss", "content": 0.27432331442832947, "timestamp": "2025-09-05 09:01:52.160428", "step": 1640, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 09:01:56.942968", "step": 1640, "epoch": 2 }, { "type": "pplx", "content": 54.70964771417281, "timestamp": "2025-09-05 09:01:56.947175", "step": 1640, "epoch": 2 }, { "type": "info", "content": "Checkpoint saved at step 1640", "timestamp": "2025-09-05 09:01:57.598988", "step": 1640, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:01:57.841522", "step": 1640, "epoch": 2 }, { "type": "loss", "content": 0.32458359003067017, "timestamp": "2025-09-05 09:01:57.843776", "step": 1641, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:01:58.040943", "step": 1641, "epoch": 2 }, { "type": "loss", "content": 0.4251089096069336, "timestamp": "2025-09-05 09:01:58.042669", "step": 1642, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:01:58.240063", "step": 1642, "epoch": 2 }, { "type": "loss", "content": 0.4108656942844391, "timestamp": "2025-09-05 09:01:58.241816", "step": 1643, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:01:58.437720", "step": 1643, "epoch": 2 }, { "type": "loss", "content": 0.38851895928382874, "timestamp": "2025-09-05 09:01:58.454504", "step": 1644, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:01:58.651986", "step": 1644, "epoch": 2 }, { "type": "loss", "content": 0.2178976833820343, "timestamp": "2025-09-05 09:01:58.654067", "step": 1645, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:01:58.822924", "step": 1645, "epoch": 2 }, { "type": "loss", "content": 0.35721564292907715, "timestamp": "2025-09-05 09:01:58.825316", "step": 1646, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:01:59.044925", "step": 1646, "epoch": 2 }, { "type": "loss", "content": 0.17846766114234924, "timestamp": "2025-09-05 09:01:59.047352", "step": 1647, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:01:59.215050", "step": 1647, "epoch": 2 }, { "type": "loss", "content": 0.38131412863731384, "timestamp": "2025-09-05 09:01:59.231949", "step": 1648, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:01:59.428199", "step": 1648, "epoch": 2 }, { "type": "loss", "content": 0.16115841269493103, "timestamp": "2025-09-05 09:01:59.430055", "step": 1649, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:01:59.636473", "step": 1649, "epoch": 2 }, { "type": "loss", "content": 0.45021679997444153, "timestamp": "2025-09-05 09:01:59.638966", "step": 1650, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:01:59.848917", "step": 1650, "epoch": 2 }, { "type": "loss", "content": 0.3558279871940613, "timestamp": "2025-09-05 09:01:59.851023", "step": 1651, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:02:00.096329", "step": 1651, "epoch": 2 }, { "type": "loss", "content": 0.40276896953582764, "timestamp": "2025-09-05 09:02:00.113286", "step": 1652, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:02:00.312785", "step": 1652, "epoch": 2 }, { "type": "loss", "content": 0.29273685812950134, "timestamp": "2025-09-05 09:02:00.314731", "step": 1653, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:02:00.563338", "step": 1653, "epoch": 2 }, { "type": "loss", "content": 0.4775174558162689, "timestamp": "2025-09-05 09:02:00.565910", "step": 1654, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:02:00.763870", "step": 1654, "epoch": 2 }, { "type": "loss", "content": 0.3428560793399811, "timestamp": "2025-09-05 09:02:00.767063", "step": 1655, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:02:01.016867", "step": 1655, "epoch": 2 }, { "type": "loss", "content": 0.3681947588920593, "timestamp": "2025-09-05 09:02:01.031643", "step": 1656, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:02:01.218711", "step": 1656, "epoch": 2 }, { "type": "loss", "content": 0.4562534689903259, "timestamp": "2025-09-05 09:02:01.220599", "step": 1657, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:02:01.606943", "step": 1657, "epoch": 2 }, { "type": "loss", "content": 0.38250502943992615, "timestamp": "2025-09-05 09:02:01.609870", "step": 1658, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:02:01.806481", "step": 1658, "epoch": 2 }, { "type": "loss", "content": 0.4949440658092499, "timestamp": "2025-09-05 09:02:01.809636", "step": 1659, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:02:02.006945", "step": 1659, "epoch": 2 }, { "type": "loss", "content": 0.30622151494026184, "timestamp": "2025-09-05 09:02:02.022446", "step": 1660, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 09:02:07.306404", "step": 1660, "epoch": 2 }, { "type": "pplx", "content": 55.28318964602614, "timestamp": "2025-09-05 09:02:07.308770", "step": 1660, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:02:07.471588", "step": 1660, "epoch": 2 }, { "type": "loss", "content": 0.41278305649757385, "timestamp": "2025-09-05 09:02:07.473751", "step": 1661, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:02:07.641861", "step": 1661, "epoch": 2 }, { "type": "loss", "content": 0.2784328758716583, "timestamp": "2025-09-05 09:02:07.644612", "step": 1662, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:02:07.853696", "step": 1662, "epoch": 2 }, { "type": "loss", "content": 0.3032529056072235, "timestamp": "2025-09-05 09:02:07.855716", "step": 1663, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:02:08.056549", "step": 1663, "epoch": 2 }, { "type": "loss", "content": 0.3818753659725189, "timestamp": "2025-09-05 09:02:08.071192", "step": 1664, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:02:08.260992", "step": 1664, "epoch": 2 }, { "type": "loss", "content": 0.49340173602104187, "timestamp": "2025-09-05 09:02:08.263931", "step": 1665, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:02:08.471439", "step": 1665, "epoch": 2 }, { "type": "loss", "content": 0.30880993604660034, "timestamp": "2025-09-05 09:02:08.474238", "step": 1666, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:02:08.681287", "step": 1666, "epoch": 2 }, { "type": "loss", "content": 0.34679117798805237, "timestamp": "2025-09-05 09:02:08.683188", "step": 1667, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:02:08.890210", "step": 1667, "epoch": 2 }, { "type": "loss", "content": 0.2942996919155121, "timestamp": "2025-09-05 09:02:08.905288", "step": 1668, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:02:09.094441", "step": 1668, "epoch": 2 }, { "type": "loss", "content": 0.43989962339401245, "timestamp": "2025-09-05 09:02:09.096279", "step": 1669, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:02:09.302837", "step": 1669, "epoch": 2 }, { "type": "loss", "content": 0.30262741446495056, "timestamp": "2025-09-05 09:02:09.310213", "step": 1670, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 4480027263872.0 }, "timestamp": "2025-09-05 09:02:09.509474", "step": 1670, "epoch": 2 }, { "type": "loss", "content": 0.2830575406551361, "timestamp": "2025-09-05 09:02:09.511875", "step": 1671, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:02:09.708618", "step": 1671, "epoch": 2 }, { "type": "loss", "content": 0.23973654210567474, "timestamp": "2025-09-05 09:02:09.723564", "step": 1672, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:02:09.912351", "step": 1672, "epoch": 2 }, { "type": "loss", "content": 0.4138979911804199, "timestamp": "2025-09-05 09:02:09.914678", "step": 1673, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:02:10.120830", "step": 1673, "epoch": 2 }, { "type": "loss", "content": 0.33984220027923584, "timestamp": "2025-09-05 09:02:10.122661", "step": 1674, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:02:10.330661", "step": 1674, "epoch": 2 }, { "type": "loss", "content": 0.4164768159389496, "timestamp": "2025-09-05 09:02:10.332757", "step": 1675, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:02:10.528555", "step": 1675, "epoch": 2 }, { "type": "loss", "content": 0.38378530740737915, "timestamp": "2025-09-05 09:02:10.545099", "step": 1676, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:02:10.752714", "step": 1676, "epoch": 2 }, { "type": "loss", "content": 0.4028756320476532, "timestamp": "2025-09-05 09:02:10.755044", "step": 1677, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:02:10.948364", "step": 1677, "epoch": 2 }, { "type": "loss", "content": 0.31969425082206726, "timestamp": "2025-09-05 09:02:10.950693", "step": 1678, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:02:11.157995", "step": 1678, "epoch": 2 }, { "type": "loss", "content": 0.41612645983695984, "timestamp": "2025-09-05 09:02:11.160072", "step": 1679, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:02:11.366276", "step": 1679, "epoch": 2 }, { "type": "loss", "content": 0.3250633478164673, "timestamp": "2025-09-05 09:02:11.381192", "step": 1680, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 09:02:16.366281", "step": 1680, "epoch": 2 }, { "type": "pplx", "content": 54.385877678644015, "timestamp": "2025-09-05 09:02:16.371386", "step": 1680, "epoch": 2 }, { "type": "info", "content": "Checkpoint saved at step 1680", "timestamp": "2025-09-05 09:02:16.822315", "step": 1680, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:02:17.010071", "step": 1680, "epoch": 2 }, { "type": "loss", "content": 0.22635141015052795, "timestamp": "2025-09-05 09:02:17.013493", "step": 1681, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:02:17.221771", "step": 1681, "epoch": 2 }, { "type": "loss", "content": 0.3104085326194763, "timestamp": "2025-09-05 09:02:17.223669", "step": 1682, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:02:17.422856", "step": 1682, "epoch": 2 }, { "type": "loss", "content": 0.3125440776348114, "timestamp": "2025-09-05 09:02:17.424683", "step": 1683, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:02:17.590705", "step": 1683, "epoch": 2 }, { "type": "loss", "content": 0.28808438777923584, "timestamp": "2025-09-05 09:02:17.624104", "step": 1684, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:02:17.822056", "step": 1684, "epoch": 2 }, { "type": "loss", "content": 0.3280973434448242, "timestamp": "2025-09-05 09:02:17.824459", "step": 1685, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:02:18.074431", "step": 1685, "epoch": 2 }, { "type": "loss", "content": 0.4642024636268616, "timestamp": "2025-09-05 09:02:18.076468", "step": 1686, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:02:18.245941", "step": 1686, "epoch": 2 }, { "type": "loss", "content": 0.4451746940612793, "timestamp": "2025-09-05 09:02:18.248641", "step": 1687, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:02:18.445851", "step": 1687, "epoch": 2 }, { "type": "loss", "content": 0.4639004170894623, "timestamp": "2025-09-05 09:02:18.460296", "step": 1688, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:02:18.647761", "step": 1688, "epoch": 2 }, { "type": "loss", "content": 0.44137394428253174, "timestamp": "2025-09-05 09:02:18.649593", "step": 1689, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:02:18.855698", "step": 1689, "epoch": 2 }, { "type": "loss", "content": 0.3657119870185852, "timestamp": "2025-09-05 09:02:18.857681", "step": 1690, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:02:19.054479", "step": 1690, "epoch": 2 }, { "type": "loss", "content": 0.38898834586143494, "timestamp": "2025-09-05 09:02:19.056545", "step": 1691, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:02:19.252320", "step": 1691, "epoch": 2 }, { "type": "loss", "content": 0.3620879352092743, "timestamp": "2025-09-05 09:02:19.266955", "step": 1692, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:02:19.456051", "step": 1692, "epoch": 2 }, { "type": "loss", "content": 0.3081320822238922, "timestamp": "2025-09-05 09:02:19.457978", "step": 1693, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:02:19.625303", "step": 1693, "epoch": 2 }, { "type": "loss", "content": 0.4219552278518677, "timestamp": "2025-09-05 09:02:19.627423", "step": 1694, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-05 09:02:19.832534", "step": 1694, "epoch": 2 }, { "type": "loss", "content": 0.31051498651504517, "timestamp": "2025-09-05 09:02:19.834368", "step": 1695, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:02:20.039831", "step": 1695, "epoch": 2 }, { "type": "loss", "content": 0.18346603214740753, "timestamp": "2025-09-05 09:02:20.054596", "step": 1696, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:02:20.251930", "step": 1696, "epoch": 2 }, { "type": "loss", "content": 0.2537613809108734, "timestamp": "2025-09-05 09:02:20.253750", "step": 1697, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:02:20.450601", "step": 1697, "epoch": 2 }, { "type": "loss", "content": 0.4183553457260132, "timestamp": "2025-09-05 09:02:20.452396", "step": 1698, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:02:20.659171", "step": 1698, "epoch": 2 }, { "type": "loss", "content": 0.3628275990486145, "timestamp": "2025-09-05 09:02:20.661000", "step": 1699, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:02:20.856750", "step": 1699, "epoch": 2 }, { "type": "loss", "content": 0.3655959367752075, "timestamp": "2025-09-05 09:02:20.871615", "step": 1700, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 09:02:25.746505", "step": 1700, "epoch": 2 }, { "type": "pplx", "content": 52.8854128909846, "timestamp": "2025-09-05 09:02:25.748403", "step": 1700, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:02:25.911309", "step": 1700, "epoch": 2 }, { "type": "loss", "content": 0.5494905710220337, "timestamp": "2025-09-05 09:02:25.913231", "step": 1701, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:02:26.118423", "step": 1701, "epoch": 2 }, { "type": "loss", "content": 0.24261198937892914, "timestamp": "2025-09-05 09:02:26.121092", "step": 1702, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:02:26.317960", "step": 1702, "epoch": 2 }, { "type": "loss", "content": 0.3774292469024658, "timestamp": "2025-09-05 09:02:26.320170", "step": 1703, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:02:26.488714", "step": 1703, "epoch": 2 }, { "type": "loss", "content": 0.3856517970561981, "timestamp": "2025-09-05 09:02:26.506248", "step": 1704, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:02:26.695169", "step": 1704, "epoch": 2 }, { "type": "loss", "content": 0.31671613454818726, "timestamp": "2025-09-05 09:02:26.698931", "step": 1705, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 4480027263872.0 }, "timestamp": "2025-09-05 09:02:26.947442", "step": 1705, "epoch": 2 }, { "type": "loss", "content": 0.33507075905799866, "timestamp": "2025-09-05 09:02:26.980111", "step": 1706, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:02:27.187298", "step": 1706, "epoch": 2 }, { "type": "loss", "content": 0.32331711053848267, "timestamp": "2025-09-05 09:02:27.189573", "step": 1707, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:02:27.355286", "step": 1707, "epoch": 2 }, { "type": "loss", "content": 0.34161141514778137, "timestamp": "2025-09-05 09:02:27.372454", "step": 1708, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:02:27.568904", "step": 1708, "epoch": 2 }, { "type": "loss", "content": 0.29065045714378357, "timestamp": "2025-09-05 09:02:27.570686", "step": 1709, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:02:27.778591", "step": 1709, "epoch": 2 }, { "type": "loss", "content": 0.22849217057228088, "timestamp": "2025-09-05 09:02:27.780580", "step": 1710, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:02:27.947810", "step": 1710, "epoch": 2 }, { "type": "loss", "content": 0.3984873592853546, "timestamp": "2025-09-05 09:02:27.949766", "step": 1711, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:02:28.154332", "step": 1711, "epoch": 2 }, { "type": "loss", "content": 0.26712897419929504, "timestamp": "2025-09-05 09:02:28.168755", "step": 1712, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:02:28.358064", "step": 1712, "epoch": 2 }, { "type": "loss", "content": 0.33036503195762634, "timestamp": "2025-09-05 09:02:28.359870", "step": 1713, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:02:28.528281", "step": 1713, "epoch": 2 }, { "type": "loss", "content": 0.40827813744544983, "timestamp": "2025-09-05 09:02:28.531089", "step": 1714, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:02:28.785579", "step": 1714, "epoch": 2 }, { "type": "loss", "content": 0.30413565039634705, "timestamp": "2025-09-05 09:02:28.787924", "step": 1715, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:02:29.125532", "step": 1715, "epoch": 2 }, { "type": "loss", "content": 0.33635202050209045, "timestamp": "2025-09-05 09:02:29.141494", "step": 1716, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-05 09:02:29.384162", "step": 1716, "epoch": 2 }, { "type": "loss", "content": 0.30479493737220764, "timestamp": "2025-09-05 09:02:29.385834", "step": 1717, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:02:29.580567", "step": 1717, "epoch": 2 }, { "type": "loss", "content": 0.25971081852912903, "timestamp": "2025-09-05 09:02:29.582717", "step": 1718, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:02:29.788226", "step": 1718, "epoch": 2 }, { "type": "loss", "content": 0.33488330245018005, "timestamp": "2025-09-05 09:02:29.790269", "step": 1719, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:02:30.038248", "step": 1719, "epoch": 2 }, { "type": "loss", "content": 0.2982542812824249, "timestamp": "2025-09-05 09:02:30.053364", "step": 1720, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 09:02:36.028498", "step": 1720, "epoch": 2 }, { "type": "pplx", "content": 52.41845639341919, "timestamp": "2025-09-05 09:02:36.031743", "step": 1720, "epoch": 2 }, { "type": "info", "content": "Checkpoint saved at step 1720", "timestamp": "2025-09-05 09:02:36.853432", "step": 1720, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:02:37.032163", "step": 1720, "epoch": 2 }, { "type": "loss", "content": 0.2861115038394928, "timestamp": "2025-09-05 09:02:37.034925", "step": 1721, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:02:37.234316", "step": 1721, "epoch": 2 }, { "type": "loss", "content": 0.22263473272323608, "timestamp": "2025-09-05 09:02:37.236395", "step": 1722, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:02:37.435471", "step": 1722, "epoch": 2 }, { "type": "loss", "content": 0.2188272327184677, "timestamp": "2025-09-05 09:02:37.437670", "step": 1723, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:02:37.636131", "step": 1723, "epoch": 2 }, { "type": "loss", "content": 0.3807090222835541, "timestamp": "2025-09-05 09:02:37.651194", "step": 1724, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:02:37.842885", "step": 1724, "epoch": 2 }, { "type": "loss", "content": 0.3141788840293884, "timestamp": "2025-09-05 09:02:37.844950", "step": 1725, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:02:38.041711", "step": 1725, "epoch": 2 }, { "type": "loss", "content": 0.28930506110191345, "timestamp": "2025-09-05 09:02:38.043434", "step": 1726, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:02:38.240729", "step": 1726, "epoch": 2 }, { "type": "loss", "content": 0.3060302734375, "timestamp": "2025-09-05 09:02:38.242833", "step": 1727, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 4480027263872.0 }, "timestamp": "2025-09-05 09:02:38.442684", "step": 1727, "epoch": 2 }, { "type": "loss", "content": 0.3646160662174225, "timestamp": "2025-09-05 09:02:38.459198", "step": 1728, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:02:38.657264", "step": 1728, "epoch": 2 }, { "type": "loss", "content": 0.4415394365787506, "timestamp": "2025-09-05 09:02:38.660657", "step": 1729, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:02:38.867898", "step": 1729, "epoch": 2 }, { "type": "loss", "content": 0.330495148897171, "timestamp": "2025-09-05 09:02:38.870259", "step": 1730, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:02:39.068825", "step": 1730, "epoch": 2 }, { "type": "loss", "content": 0.30471497774124146, "timestamp": "2025-09-05 09:02:39.070707", "step": 1731, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:02:39.267010", "step": 1731, "epoch": 2 }, { "type": "loss", "content": 0.3218672275543213, "timestamp": "2025-09-05 09:02:39.281489", "step": 1732, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:02:39.475538", "step": 1732, "epoch": 2 }, { "type": "loss", "content": 0.467593252658844, "timestamp": "2025-09-05 09:02:39.477300", "step": 1733, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:02:39.726323", "step": 1733, "epoch": 2 }, { "type": "loss", "content": 0.37956318259239197, "timestamp": "2025-09-05 09:02:39.728240", "step": 1734, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:02:39.925216", "step": 1734, "epoch": 2 }, { "type": "loss", "content": 0.4981067478656769, "timestamp": "2025-09-05 09:02:39.927497", "step": 1735, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:02:40.135632", "step": 1735, "epoch": 2 }, { "type": "loss", "content": 0.31048834323883057, "timestamp": "2025-09-05 09:02:40.150598", "step": 1736, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:02:40.366202", "step": 1736, "epoch": 2 }, { "type": "loss", "content": 0.283828467130661, "timestamp": "2025-09-05 09:02:40.368169", "step": 1737, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:02:40.535557", "step": 1737, "epoch": 2 }, { "type": "loss", "content": 0.25476568937301636, "timestamp": "2025-09-05 09:02:40.538151", "step": 1738, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:02:40.752855", "step": 1738, "epoch": 2 }, { "type": "loss", "content": 0.23773322999477386, "timestamp": "2025-09-05 09:02:40.754924", "step": 1739, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:02:40.962448", "step": 1739, "epoch": 2 }, { "type": "loss", "content": 0.3678232729434967, "timestamp": "2025-09-05 09:02:40.977673", "step": 1740, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 09:02:46.695748", "step": 1740, "epoch": 2 }, { "type": "pplx", "content": 52.87610880309225, "timestamp": "2025-09-05 09:02:46.697902", "step": 1740, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:02:46.860812", "step": 1740, "epoch": 2 }, { "type": "loss", "content": 0.42199501395225525, "timestamp": "2025-09-05 09:02:46.862869", "step": 1741, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:02:47.030713", "step": 1741, "epoch": 2 }, { "type": "loss", "content": 0.27012646198272705, "timestamp": "2025-09-05 09:02:47.048230", "step": 1742, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:02:47.253601", "step": 1742, "epoch": 2 }, { "type": "loss", "content": 0.33601266145706177, "timestamp": "2025-09-05 09:02:47.255971", "step": 1743, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:02:47.450875", "step": 1743, "epoch": 2 }, { "type": "loss", "content": 0.34322261810302734, "timestamp": "2025-09-05 09:02:47.466057", "step": 1744, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:02:47.707178", "step": 1744, "epoch": 2 }, { "type": "loss", "content": 0.21854346990585327, "timestamp": "2025-09-05 09:02:47.709562", "step": 1745, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:02:47.915477", "step": 1745, "epoch": 2 }, { "type": "loss", "content": 0.2908678352832794, "timestamp": "2025-09-05 09:02:47.917360", "step": 1746, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-05 09:02:48.084021", "step": 1746, "epoch": 2 }, { "type": "loss", "content": 0.22152143716812134, "timestamp": "2025-09-05 09:02:48.167207", "step": 1747, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:02:48.412311", "step": 1747, "epoch": 2 }, { "type": "loss", "content": 0.3264712691307068, "timestamp": "2025-09-05 09:02:48.427033", "step": 1748, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:02:48.615841", "step": 1748, "epoch": 2 }, { "type": "loss", "content": 0.33076176047325134, "timestamp": "2025-09-05 09:02:48.658513", "step": 1749, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-05 09:02:48.907009", "step": 1749, "epoch": 2 }, { "type": "loss", "content": 0.29463064670562744, "timestamp": "2025-09-05 09:02:48.909234", "step": 1750, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:02:49.117191", "step": 1750, "epoch": 2 }, { "type": "loss", "content": 0.3289870023727417, "timestamp": "2025-09-05 09:02:49.119123", "step": 1751, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:02:49.314893", "step": 1751, "epoch": 2 }, { "type": "loss", "content": 0.35243523120880127, "timestamp": "2025-09-05 09:02:49.329655", "step": 1752, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:02:49.654407", "step": 1752, "epoch": 2 }, { "type": "loss", "content": 0.39635375142097473, "timestamp": "2025-09-05 09:02:49.656285", "step": 1753, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:02:49.861685", "step": 1753, "epoch": 2 }, { "type": "loss", "content": 0.23487235605716705, "timestamp": "2025-09-05 09:02:49.864300", "step": 1754, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:02:50.061153", "step": 1754, "epoch": 2 }, { "type": "loss", "content": 0.3939005136489868, "timestamp": "2025-09-05 09:02:50.103946", "step": 1755, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:02:50.438452", "step": 1755, "epoch": 2 }, { "type": "loss", "content": 0.3778200149536133, "timestamp": "2025-09-05 09:02:50.448072", "step": 1756, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:02:50.613169", "step": 1756, "epoch": 2 }, { "type": "loss", "content": 0.2921392619609833, "timestamp": "2025-09-05 09:02:50.614850", "step": 1757, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-05 09:02:50.819770", "step": 1757, "epoch": 2 }, { "type": "loss", "content": 0.24706591665744781, "timestamp": "2025-09-05 09:02:50.821640", "step": 1758, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:02:51.019320", "step": 1758, "epoch": 2 }, { "type": "loss", "content": 0.3144387900829315, "timestamp": "2025-09-05 09:02:51.020986", "step": 1759, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 4480027263872.0 }, "timestamp": "2025-09-05 09:02:51.218943", "step": 1759, "epoch": 2 }, { "type": "loss", "content": 0.45034703612327576, "timestamp": "2025-09-05 09:02:51.233986", "step": 1760, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 09:02:56.821336", "step": 1760, "epoch": 2 }, { "type": "pplx", "content": 53.89968272891614, "timestamp": "2025-09-05 09:02:56.825297", "step": 1760, "epoch": 2 }, { "type": "info", "content": "Checkpoint saved at step 1760", "timestamp": "2025-09-05 09:02:57.358500", "step": 1760, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:02:57.525650", "step": 1760, "epoch": 2 }, { "type": "loss", "content": 0.2909316122531891, "timestamp": "2025-09-05 09:02:57.527607", "step": 1761, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:02:57.732796", "step": 1761, "epoch": 2 }, { "type": "loss", "content": 0.372611939907074, "timestamp": "2025-09-05 09:02:57.734962", "step": 1762, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:02:57.982758", "step": 1762, "epoch": 2 }, { "type": "loss", "content": 0.3195144534111023, "timestamp": "2025-09-05 09:02:57.984997", "step": 1763, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:02:58.191741", "step": 1763, "epoch": 2 }, { "type": "loss", "content": 0.2733082175254822, "timestamp": "2025-09-05 09:02:58.206696", "step": 1764, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:02:58.394732", "step": 1764, "epoch": 2 }, { "type": "loss", "content": 0.34646353125572205, "timestamp": "2025-09-05 09:02:58.397168", "step": 1765, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:02:58.604597", "step": 1765, "epoch": 2 }, { "type": "loss", "content": 0.2818068265914917, "timestamp": "2025-09-05 09:02:58.666650", "step": 1766, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:02:58.946660", "step": 1766, "epoch": 2 }, { "type": "loss", "content": 0.20438231527805328, "timestamp": "2025-09-05 09:02:58.948538", "step": 1767, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:02:59.147170", "step": 1767, "epoch": 2 }, { "type": "loss", "content": 0.2941116690635681, "timestamp": "2025-09-05 09:02:59.157344", "step": 1768, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:02:59.318970", "step": 1768, "epoch": 2 }, { "type": "loss", "content": 0.387058287858963, "timestamp": "2025-09-05 09:02:59.341479", "step": 1769, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:02:59.590735", "step": 1769, "epoch": 2 }, { "type": "loss", "content": 0.5514186024665833, "timestamp": "2025-09-05 09:02:59.593040", "step": 1770, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:02:59.789424", "step": 1770, "epoch": 2 }, { "type": "loss", "content": 0.39721089601516724, "timestamp": "2025-09-05 09:02:59.791671", "step": 1771, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:03:00.085156", "step": 1771, "epoch": 2 }, { "type": "loss", "content": 0.4141196608543396, "timestamp": "2025-09-05 09:03:00.094213", "step": 1772, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:03:00.255930", "step": 1772, "epoch": 2 }, { "type": "loss", "content": 0.3068144917488098, "timestamp": "2025-09-05 09:03:00.257769", "step": 1773, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:03:00.423547", "step": 1773, "epoch": 2 }, { "type": "loss", "content": 0.2862834632396698, "timestamp": "2025-09-05 09:03:00.426909", "step": 1774, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:03:00.636780", "step": 1774, "epoch": 2 }, { "type": "loss", "content": 0.5047730207443237, "timestamp": "2025-09-05 09:03:00.639450", "step": 1775, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:03:00.890445", "step": 1775, "epoch": 2 }, { "type": "loss", "content": 0.3867059648036957, "timestamp": "2025-09-05 09:03:00.907081", "step": 1776, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:03:01.102574", "step": 1776, "epoch": 2 }, { "type": "loss", "content": 0.32131898403167725, "timestamp": "2025-09-05 09:03:01.104756", "step": 1777, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:03:01.301984", "step": 1777, "epoch": 2 }, { "type": "loss", "content": 0.26770636439323425, "timestamp": "2025-09-05 09:03:01.304878", "step": 1778, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:03:01.554677", "step": 1778, "epoch": 2 }, { "type": "loss", "content": 0.15419012308120728, "timestamp": "2025-09-05 09:03:01.556780", "step": 1779, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:03:01.752071", "step": 1779, "epoch": 2 }, { "type": "loss", "content": 0.4039864242076874, "timestamp": "2025-09-05 09:03:01.766230", "step": 1780, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 09:03:07.403589", "step": 1780, "epoch": 2 }, { "type": "pplx", "content": 54.678229296229425, "timestamp": "2025-09-05 09:03:07.406101", "step": 1780, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:03:07.567589", "step": 1780, "epoch": 2 }, { "type": "loss", "content": 0.33174604177474976, "timestamp": "2025-09-05 09:03:07.569717", "step": 1781, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:03:07.776051", "step": 1781, "epoch": 2 }, { "type": "loss", "content": 0.36827021837234497, "timestamp": "2025-09-05 09:03:07.778234", "step": 1782, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:03:07.984836", "step": 1782, "epoch": 2 }, { "type": "loss", "content": 0.23714663088321686, "timestamp": "2025-09-05 09:03:07.986846", "step": 1783, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:03:08.184589", "step": 1783, "epoch": 2 }, { "type": "loss", "content": 0.34843164682388306, "timestamp": "2025-09-05 09:03:08.198645", "step": 1784, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:03:08.387243", "step": 1784, "epoch": 2 }, { "type": "loss", "content": 0.33095985651016235, "timestamp": "2025-09-05 09:03:08.388990", "step": 1785, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:03:08.554852", "step": 1785, "epoch": 2 }, { "type": "loss", "content": 0.34028369188308716, "timestamp": "2025-09-05 09:03:08.559697", "step": 1786, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:03:08.766857", "step": 1786, "epoch": 2 }, { "type": "loss", "content": 0.25933557748794556, "timestamp": "2025-09-05 09:03:08.770226", "step": 1787, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:03:09.021745", "step": 1787, "epoch": 2 }, { "type": "loss", "content": 0.3989739716053009, "timestamp": "2025-09-05 09:03:09.035974", "step": 1788, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:03:09.224379", "step": 1788, "epoch": 2 }, { "type": "loss", "content": 0.38663944602012634, "timestamp": "2025-09-05 09:03:09.227234", "step": 1789, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:03:09.430888", "step": 1789, "epoch": 2 }, { "type": "loss", "content": 0.17637419700622559, "timestamp": "2025-09-05 09:03:09.432645", "step": 1790, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:03:09.628726", "step": 1790, "epoch": 2 }, { "type": "loss", "content": 0.2469310164451599, "timestamp": "2025-09-05 09:03:09.630565", "step": 1791, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:03:09.826584", "step": 1791, "epoch": 2 }, { "type": "loss", "content": 0.3839358687400818, "timestamp": "2025-09-05 09:03:09.840975", "step": 1792, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:03:10.071018", "step": 1792, "epoch": 2 }, { "type": "loss", "content": 0.36574140191078186, "timestamp": "2025-09-05 09:03:10.114734", "step": 1793, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:03:10.319121", "step": 1793, "epoch": 2 }, { "type": "loss", "content": 0.3396137058734894, "timestamp": "2025-09-05 09:03:10.321423", "step": 1794, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:03:10.528211", "step": 1794, "epoch": 2 }, { "type": "loss", "content": 0.3768261671066284, "timestamp": "2025-09-05 09:03:10.530196", "step": 1795, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:03:10.775691", "step": 1795, "epoch": 2 }, { "type": "loss", "content": 0.3471960723400116, "timestamp": "2025-09-05 09:03:10.791006", "step": 1796, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:03:10.980600", "step": 1796, "epoch": 2 }, { "type": "loss", "content": 0.4404922425746918, "timestamp": "2025-09-05 09:03:10.982368", "step": 1797, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-05 09:03:11.274310", "step": 1797, "epoch": 2 }, { "type": "loss", "content": 0.2796347141265869, "timestamp": "2025-09-05 09:03:11.276786", "step": 1798, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:03:11.474461", "step": 1798, "epoch": 2 }, { "type": "loss", "content": 0.3296290338039398, "timestamp": "2025-09-05 09:03:11.476700", "step": 1799, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:03:11.682812", "step": 1799, "epoch": 2 }, { "type": "loss", "content": 0.2389107197523117, "timestamp": "2025-09-05 09:03:11.699218", "step": 1800, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 09:03:16.674241", "step": 1800, "epoch": 2 }, { "type": "pplx", "content": 55.036339384505084, "timestamp": "2025-09-05 09:03:16.676009", "step": 1800, "epoch": 2 }, { "type": "info", "content": "Checkpoint saved at step 1800", "timestamp": "2025-09-05 09:03:17.178926", "step": 1800, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:03:17.379411", "step": 1800, "epoch": 2 }, { "type": "loss", "content": 0.32822126150131226, "timestamp": "2025-09-05 09:03:17.382120", "step": 1801, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:03:17.578801", "step": 1801, "epoch": 2 }, { "type": "loss", "content": 0.2833464443683624, "timestamp": "2025-09-05 09:03:17.581895", "step": 1802, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:03:17.781822", "step": 1802, "epoch": 2 }, { "type": "loss", "content": 0.3650655448436737, "timestamp": "2025-09-05 09:03:17.784468", "step": 1803, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:03:18.050377", "step": 1803, "epoch": 2 }, { "type": "loss", "content": 0.3336227238178253, "timestamp": "2025-09-05 09:03:18.066982", "step": 1804, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:03:18.267529", "step": 1804, "epoch": 2 }, { "type": "loss", "content": 0.3089102804660797, "timestamp": "2025-09-05 09:03:18.270031", "step": 1805, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:03:18.468268", "step": 1805, "epoch": 2 }, { "type": "loss", "content": 0.2924884259700775, "timestamp": "2025-09-05 09:03:18.470148", "step": 1806, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:03:18.665728", "step": 1806, "epoch": 2 }, { "type": "loss", "content": 0.25991401076316833, "timestamp": "2025-09-05 09:03:18.667725", "step": 1807, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:03:18.872934", "step": 1807, "epoch": 2 }, { "type": "loss", "content": 0.38074877858161926, "timestamp": "2025-09-05 09:03:18.887103", "step": 1808, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:03:19.075301", "step": 1808, "epoch": 2 }, { "type": "loss", "content": 0.2662460505962372, "timestamp": "2025-09-05 09:03:19.077492", "step": 1809, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:03:19.283635", "step": 1809, "epoch": 2 }, { "type": "loss", "content": 0.3200852572917938, "timestamp": "2025-09-05 09:03:19.285866", "step": 1810, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:03:19.514373", "step": 1810, "epoch": 2 }, { "type": "loss", "content": 0.23158332705497742, "timestamp": "2025-09-05 09:03:19.516172", "step": 1811, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:03:19.714236", "step": 1811, "epoch": 2 }, { "type": "loss", "content": 0.38919728994369507, "timestamp": "2025-09-05 09:03:19.723741", "step": 1812, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:03:19.892125", "step": 1812, "epoch": 2 }, { "type": "loss", "content": 0.38500481843948364, "timestamp": "2025-09-05 09:03:19.894564", "step": 1813, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:03:20.062946", "step": 1813, "epoch": 2 }, { "type": "loss", "content": 0.4389552175998688, "timestamp": "2025-09-05 09:03:20.065351", "step": 1814, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:03:20.262580", "step": 1814, "epoch": 2 }, { "type": "loss", "content": 0.4390960931777954, "timestamp": "2025-09-05 09:03:20.265034", "step": 1815, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:03:20.432565", "step": 1815, "epoch": 2 }, { "type": "loss", "content": 0.3098478615283966, "timestamp": "2025-09-05 09:03:20.448133", "step": 1816, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:03:20.639359", "step": 1816, "epoch": 2 }, { "type": "loss", "content": 0.28565704822540283, "timestamp": "2025-09-05 09:03:20.641294", "step": 1817, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:03:20.850568", "step": 1817, "epoch": 2 }, { "type": "loss", "content": 0.47005581855773926, "timestamp": "2025-09-05 09:03:20.852541", "step": 1818, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:03:21.020394", "step": 1818, "epoch": 2 }, { "type": "loss", "content": 0.38893210887908936, "timestamp": "2025-09-05 09:03:21.022791", "step": 1819, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:03:21.219082", "step": 1819, "epoch": 2 }, { "type": "loss", "content": 0.26181450486183167, "timestamp": "2025-09-05 09:03:21.228380", "step": 1820, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 09:03:25.920801", "step": 1820, "epoch": 2 }, { "type": "pplx", "content": 55.33664731691632, "timestamp": "2025-09-05 09:03:25.923217", "step": 1820, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:03:26.088428", "step": 1820, "epoch": 2 }, { "type": "loss", "content": 0.4290368854999542, "timestamp": "2025-09-05 09:03:26.090971", "step": 1821, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:03:26.256546", "step": 1821, "epoch": 2 }, { "type": "loss", "content": 0.3403278887271881, "timestamp": "2025-09-05 09:03:26.272384", "step": 1822, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:03:26.478670", "step": 1822, "epoch": 2 }, { "type": "loss", "content": 0.35543927550315857, "timestamp": "2025-09-05 09:03:26.481017", "step": 1823, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:03:26.678647", "step": 1823, "epoch": 2 }, { "type": "loss", "content": 0.36923372745513916, "timestamp": "2025-09-05 09:03:26.696460", "step": 1824, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:03:26.894759", "step": 1824, "epoch": 2 }, { "type": "loss", "content": 0.33264076709747314, "timestamp": "2025-09-05 09:03:26.898813", "step": 1825, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:03:27.095284", "step": 1825, "epoch": 2 }, { "type": "loss", "content": 0.2916991710662842, "timestamp": "2025-09-05 09:03:27.097313", "step": 1826, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:03:27.294793", "step": 1826, "epoch": 2 }, { "type": "loss", "content": 0.339844286441803, "timestamp": "2025-09-05 09:03:27.296700", "step": 1827, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:03:27.492683", "step": 1827, "epoch": 2 }, { "type": "loss", "content": 0.25583401322364807, "timestamp": "2025-09-05 09:03:27.501965", "step": 1828, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:03:27.667250", "step": 1828, "epoch": 2 }, { "type": "loss", "content": 0.2994399070739746, "timestamp": "2025-09-05 09:03:27.669072", "step": 1829, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:03:27.875984", "step": 1829, "epoch": 2 }, { "type": "loss", "content": 0.2248479723930359, "timestamp": "2025-09-05 09:03:27.877941", "step": 1830, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:03:28.074462", "step": 1830, "epoch": 2 }, { "type": "loss", "content": 0.44062700867652893, "timestamp": "2025-09-05 09:03:28.076844", "step": 1831, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:03:28.284610", "step": 1831, "epoch": 2 }, { "type": "loss", "content": 0.34865128993988037, "timestamp": "2025-09-05 09:03:28.294083", "step": 1832, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:03:28.457941", "step": 1832, "epoch": 2 }, { "type": "loss", "content": 0.3390546441078186, "timestamp": "2025-09-05 09:03:28.459806", "step": 1833, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:03:28.625641", "step": 1833, "epoch": 2 }, { "type": "loss", "content": 0.4029175639152527, "timestamp": "2025-09-05 09:03:28.627560", "step": 1834, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:03:28.833445", "step": 1834, "epoch": 2 }, { "type": "loss", "content": 0.29427599906921387, "timestamp": "2025-09-05 09:03:28.835361", "step": 1835, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:03:29.042402", "step": 1835, "epoch": 2 }, { "type": "loss", "content": 0.36329180002212524, "timestamp": "2025-09-05 09:03:29.059551", "step": 1836, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:03:29.261899", "step": 1836, "epoch": 2 }, { "type": "loss", "content": 0.24370358884334564, "timestamp": "2025-09-05 09:03:29.264242", "step": 1837, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 4800029206464.0 }, "timestamp": "2025-09-05 09:03:29.435765", "step": 1837, "epoch": 2 }, { "type": "loss", "content": 0.4641832411289215, "timestamp": "2025-09-05 09:03:29.438505", "step": 1838, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:03:29.645590", "step": 1838, "epoch": 2 }, { "type": "loss", "content": 0.27168017625808716, "timestamp": "2025-09-05 09:03:29.648925", "step": 1839, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:03:29.898200", "step": 1839, "epoch": 2 }, { "type": "loss", "content": 0.2619711458683014, "timestamp": "2025-09-05 09:03:29.964385", "step": 1840, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 09:03:34.650912", "step": 1840, "epoch": 2 }, { "type": "pplx", "content": 55.72341477187909, "timestamp": "2025-09-05 09:03:34.653569", "step": 1840, "epoch": 2 }, { "type": "info", "content": "Checkpoint saved at step 1840", "timestamp": "2025-09-05 09:03:35.179728", "step": 1840, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:03:35.347722", "step": 1840, "epoch": 2 }, { "type": "loss", "content": 0.3577501177787781, "timestamp": "2025-09-05 09:03:35.349827", "step": 1841, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:03:35.556211", "step": 1841, "epoch": 2 }, { "type": "loss", "content": 0.3645785450935364, "timestamp": "2025-09-05 09:03:35.558862", "step": 1842, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:03:35.726141", "step": 1842, "epoch": 2 }, { "type": "loss", "content": 0.356271892786026, "timestamp": "2025-09-05 09:03:35.728901", "step": 1843, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:03:35.932810", "step": 1843, "epoch": 2 }, { "type": "loss", "content": 0.2203180342912674, "timestamp": "2025-09-05 09:03:35.942768", "step": 1844, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:03:36.107321", "step": 1844, "epoch": 2 }, { "type": "loss", "content": 0.4093642234802246, "timestamp": "2025-09-05 09:03:36.109424", "step": 1845, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:03:36.277831", "step": 1845, "epoch": 2 }, { "type": "loss", "content": 0.2778435945510864, "timestamp": "2025-09-05 09:03:36.280627", "step": 1846, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:03:36.449396", "step": 1846, "epoch": 2 }, { "type": "loss", "content": 0.3252573013305664, "timestamp": "2025-09-05 09:03:36.452403", "step": 1847, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 4480027263872.0 }, "timestamp": "2025-09-05 09:03:36.659098", "step": 1847, "epoch": 2 }, { "type": "loss", "content": 0.41045743227005005, "timestamp": "2025-09-05 09:03:36.668901", "step": 1848, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:03:36.833074", "step": 1848, "epoch": 2 }, { "type": "loss", "content": 0.38269633054733276, "timestamp": "2025-09-05 09:03:36.836355", "step": 1849, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:03:37.005419", "step": 1849, "epoch": 2 }, { "type": "loss", "content": 0.46076807379722595, "timestamp": "2025-09-05 09:03:37.007421", "step": 1850, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:03:37.174049", "step": 1850, "epoch": 2 }, { "type": "loss", "content": 0.40671306848526, "timestamp": "2025-09-05 09:03:37.176204", "step": 1851, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:03:37.373674", "step": 1851, "epoch": 2 }, { "type": "loss", "content": 0.19898702204227448, "timestamp": "2025-09-05 09:03:37.384578", "step": 1852, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:03:37.547587", "step": 1852, "epoch": 2 }, { "type": "loss", "content": 0.21051372587680817, "timestamp": "2025-09-05 09:03:37.549476", "step": 1853, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-05 09:03:37.758174", "step": 1853, "epoch": 2 }, { "type": "loss", "content": 0.17444637417793274, "timestamp": "2025-09-05 09:03:37.760015", "step": 1854, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:03:38.011527", "step": 1854, "epoch": 2 }, { "type": "loss", "content": 0.25767529010772705, "timestamp": "2025-09-05 09:03:38.013700", "step": 1855, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:03:38.211086", "step": 1855, "epoch": 2 }, { "type": "loss", "content": 0.283276230096817, "timestamp": "2025-09-05 09:03:38.220740", "step": 1856, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:03:38.384710", "step": 1856, "epoch": 2 }, { "type": "loss", "content": 0.36334753036499023, "timestamp": "2025-09-05 09:03:38.387474", "step": 1857, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:03:38.555920", "step": 1857, "epoch": 2 }, { "type": "loss", "content": 0.24148380756378174, "timestamp": "2025-09-05 09:03:38.558779", "step": 1858, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 4480027263872.0 }, "timestamp": "2025-09-05 09:03:38.758088", "step": 1858, "epoch": 2 }, { "type": "loss", "content": 0.2839478552341461, "timestamp": "2025-09-05 09:03:38.760496", "step": 1859, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:03:38.956342", "step": 1859, "epoch": 2 }, { "type": "loss", "content": 0.2954237163066864, "timestamp": "2025-09-05 09:03:38.973295", "step": 1860, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 09:03:43.752815", "step": 1860, "epoch": 2 }, { "type": "pplx", "content": 55.779841573659716, "timestamp": "2025-09-05 09:03:43.755396", "step": 1860, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:03:43.917975", "step": 1860, "epoch": 2 }, { "type": "loss", "content": 0.25287219882011414, "timestamp": "2025-09-05 09:03:43.920185", "step": 1861, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:03:44.125292", "step": 1861, "epoch": 2 }, { "type": "loss", "content": 0.36329612135887146, "timestamp": "2025-09-05 09:03:44.127298", "step": 1862, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:03:44.334342", "step": 1862, "epoch": 2 }, { "type": "loss", "content": 0.214580699801445, "timestamp": "2025-09-05 09:03:44.336143", "step": 1863, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:03:44.543600", "step": 1863, "epoch": 2 }, { "type": "loss", "content": 0.32577669620513916, "timestamp": "2025-09-05 09:03:44.560339", "step": 1864, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:03:44.759134", "step": 1864, "epoch": 2 }, { "type": "loss", "content": 0.45734408497810364, "timestamp": "2025-09-05 09:03:44.761720", "step": 1865, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:03:44.967935", "step": 1865, "epoch": 2 }, { "type": "loss", "content": 0.3126198947429657, "timestamp": "2025-09-05 09:03:44.970550", "step": 1866, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:03:45.140680", "step": 1866, "epoch": 2 }, { "type": "loss", "content": 0.2847732603549957, "timestamp": "2025-09-05 09:03:45.142554", "step": 1867, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:03:45.349745", "step": 1867, "epoch": 2 }, { "type": "loss", "content": 0.25974026322364807, "timestamp": "2025-09-05 09:03:45.359453", "step": 1868, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:03:45.559114", "step": 1868, "epoch": 2 }, { "type": "loss", "content": 0.3027217388153076, "timestamp": "2025-09-05 09:03:45.561007", "step": 1869, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:03:45.767401", "step": 1869, "epoch": 2 }, { "type": "loss", "content": 0.2498902678489685, "timestamp": "2025-09-05 09:03:45.769305", "step": 1870, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:03:45.966695", "step": 1870, "epoch": 2 }, { "type": "loss", "content": 0.3127624988555908, "timestamp": "2025-09-05 09:03:45.969508", "step": 1871, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:03:46.181681", "step": 1871, "epoch": 2 }, { "type": "loss", "content": 0.298271507024765, "timestamp": "2025-09-05 09:03:46.199839", "step": 1872, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:03:46.481768", "step": 1872, "epoch": 2 }, { "type": "loss", "content": 0.31834274530410767, "timestamp": "2025-09-05 09:03:46.483757", "step": 1873, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:03:46.783137", "step": 1873, "epoch": 2 }, { "type": "loss", "content": 0.264957994222641, "timestamp": "2025-09-05 09:03:46.799236", "step": 1874, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:03:47.018791", "step": 1874, "epoch": 2 }, { "type": "loss", "content": 0.43002596497535706, "timestamp": "2025-09-05 09:03:47.020698", "step": 1875, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:03:47.227210", "step": 1875, "epoch": 2 }, { "type": "loss", "content": 0.4498365819454193, "timestamp": "2025-09-05 09:03:47.242075", "step": 1876, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:03:47.496978", "step": 1876, "epoch": 2 }, { "type": "loss", "content": 0.21851421892642975, "timestamp": "2025-09-05 09:03:47.498804", "step": 1877, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:03:47.665836", "step": 1877, "epoch": 2 }, { "type": "loss", "content": 0.3184349238872528, "timestamp": "2025-09-05 09:03:47.667876", "step": 1878, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:03:47.871856", "step": 1878, "epoch": 2 }, { "type": "loss", "content": 0.33598777651786804, "timestamp": "2025-09-05 09:03:47.873933", "step": 1879, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:03:48.041864", "step": 1879, "epoch": 2 }, { "type": "loss", "content": 0.31021371483802795, "timestamp": "2025-09-05 09:03:48.058333", "step": 1880, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 09:03:52.719413", "step": 1880, "epoch": 2 }, { "type": "pplx", "content": 55.9173870399914, "timestamp": "2025-09-05 09:03:52.721544", "step": 1880, "epoch": 2 }, { "type": "info", "content": "Checkpoint saved at step 1880", "timestamp": "2025-09-05 09:03:53.184102", "step": 1880, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:03:53.347637", "step": 1880, "epoch": 2 }, { "type": "loss", "content": 0.35257551074028015, "timestamp": "2025-09-05 09:03:53.349442", "step": 1881, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:03:53.516039", "step": 1881, "epoch": 2 }, { "type": "loss", "content": 0.41870570182800293, "timestamp": "2025-09-05 09:03:53.518151", "step": 1882, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:03:53.685692", "step": 1882, "epoch": 2 }, { "type": "loss", "content": 0.33617204427719116, "timestamp": "2025-09-05 09:03:53.688722", "step": 1883, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:03:53.883855", "step": 1883, "epoch": 2 }, { "type": "loss", "content": 0.4209844172000885, "timestamp": "2025-09-05 09:03:53.899544", "step": 1884, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:03:54.089537", "step": 1884, "epoch": 2 }, { "type": "loss", "content": 0.27658504247665405, "timestamp": "2025-09-05 09:03:54.097167", "step": 1885, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:03:54.432684", "step": 1885, "epoch": 2 }, { "type": "loss", "content": 0.30306634306907654, "timestamp": "2025-09-05 09:03:54.434610", "step": 1886, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:03:54.601164", "step": 1886, "epoch": 2 }, { "type": "loss", "content": 0.3418419361114502, "timestamp": "2025-09-05 09:03:54.603665", "step": 1887, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:03:54.806396", "step": 1887, "epoch": 2 }, { "type": "loss", "content": 0.24061523377895355, "timestamp": "2025-09-05 09:03:54.822115", "step": 1888, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:03:55.010753", "step": 1888, "epoch": 2 }, { "type": "loss", "content": 0.3230704665184021, "timestamp": "2025-09-05 09:03:55.013738", "step": 1889, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:03:55.218951", "step": 1889, "epoch": 2 }, { "type": "loss", "content": 0.28835007548332214, "timestamp": "2025-09-05 09:03:55.221346", "step": 1890, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:03:55.387695", "step": 1890, "epoch": 2 }, { "type": "loss", "content": 0.41431716084480286, "timestamp": "2025-09-05 09:03:55.390377", "step": 1891, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:03:55.586573", "step": 1891, "epoch": 2 }, { "type": "loss", "content": 0.19797471165657043, "timestamp": "2025-09-05 09:03:55.597127", "step": 1892, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:03:55.759697", "step": 1892, "epoch": 2 }, { "type": "loss", "content": 0.3431963324546814, "timestamp": "2025-09-05 09:03:55.762708", "step": 1893, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:03:55.971229", "step": 1893, "epoch": 2 }, { "type": "loss", "content": 0.3090471923351288, "timestamp": "2025-09-05 09:03:55.973593", "step": 1894, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:03:56.142593", "step": 1894, "epoch": 2 }, { "type": "loss", "content": 0.2430524230003357, "timestamp": "2025-09-05 09:03:56.145717", "step": 1895, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:03:56.352947", "step": 1895, "epoch": 2 }, { "type": "loss", "content": 0.23343847692012787, "timestamp": "2025-09-05 09:03:56.365935", "step": 1896, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:03:56.529315", "step": 1896, "epoch": 2 }, { "type": "loss", "content": 0.3793867826461792, "timestamp": "2025-09-05 09:03:56.531878", "step": 1897, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:03:56.698816", "step": 1897, "epoch": 2 }, { "type": "loss", "content": 0.37744349241256714, "timestamp": "2025-09-05 09:03:56.701645", "step": 1898, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:03:56.871278", "step": 1898, "epoch": 2 }, { "type": "loss", "content": 0.22595612704753876, "timestamp": "2025-09-05 09:03:56.874840", "step": 1899, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:03:57.081380", "step": 1899, "epoch": 2 }, { "type": "loss", "content": 0.25911441445350647, "timestamp": "2025-09-05 09:03:57.096490", "step": 1900, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 09:04:01.883995", "step": 1900, "epoch": 2 }, { "type": "pplx", "content": 55.85029067024223, "timestamp": "2025-09-05 09:04:01.886021", "step": 1900, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 4480027263872.0 }, "timestamp": "2025-09-05 09:04:02.049119", "step": 1900, "epoch": 2 }, { "type": "loss", "content": 0.31988221406936646, "timestamp": "2025-09-05 09:04:02.050787", "step": 1901, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:04:02.256430", "step": 1901, "epoch": 2 }, { "type": "loss", "content": 0.2469642162322998, "timestamp": "2025-09-05 09:04:02.258336", "step": 1902, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:04:02.428352", "step": 1902, "epoch": 2 }, { "type": "loss", "content": 0.35464945435523987, "timestamp": "2025-09-05 09:04:02.430549", "step": 1903, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:04:02.629860", "step": 1903, "epoch": 2 }, { "type": "loss", "content": 0.4780825972557068, "timestamp": "2025-09-05 09:04:02.640245", "step": 1904, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-05 09:04:02.801940", "step": 1904, "epoch": 2 }, { "type": "loss", "content": 0.2757084369659424, "timestamp": "2025-09-05 09:04:02.803938", "step": 1905, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:04:02.972141", "step": 1905, "epoch": 2 }, { "type": "loss", "content": 0.1613525003194809, "timestamp": "2025-09-05 09:04:02.974528", "step": 1906, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:04:03.181297", "step": 1906, "epoch": 2 }, { "type": "loss", "content": 0.36126402020454407, "timestamp": "2025-09-05 09:04:03.183198", "step": 1907, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:04:03.355002", "step": 1907, "epoch": 2 }, { "type": "loss", "content": 0.49422934651374817, "timestamp": "2025-09-05 09:04:03.370219", "step": 1908, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:04:03.563094", "step": 1908, "epoch": 2 }, { "type": "loss", "content": 0.40730562806129456, "timestamp": "2025-09-05 09:04:03.564993", "step": 1909, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:04:03.733709", "step": 1909, "epoch": 2 }, { "type": "loss", "content": 0.3607218265533447, "timestamp": "2025-09-05 09:04:03.736360", "step": 1910, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:04:03.931318", "step": 1910, "epoch": 2 }, { "type": "loss", "content": 0.3690507113933563, "timestamp": "2025-09-05 09:04:03.933114", "step": 1911, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 4480027263872.0 }, "timestamp": "2025-09-05 09:04:04.104801", "step": 1911, "epoch": 2 }, { "type": "loss", "content": 0.2338135987520218, "timestamp": "2025-09-05 09:04:04.114485", "step": 1912, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:04:04.278006", "step": 1912, "epoch": 2 }, { "type": "loss", "content": 0.46766653656959534, "timestamp": "2025-09-05 09:04:04.279765", "step": 1913, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:04:04.449674", "step": 1913, "epoch": 2 }, { "type": "loss", "content": 0.36028924584388733, "timestamp": "2025-09-05 09:04:04.451446", "step": 1914, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:04:04.618848", "step": 1914, "epoch": 2 }, { "type": "loss", "content": 0.4140165448188782, "timestamp": "2025-09-05 09:04:04.621294", "step": 1915, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:04:04.819558", "step": 1915, "epoch": 2 }, { "type": "loss", "content": 0.2273964136838913, "timestamp": "2025-09-05 09:04:04.828828", "step": 1916, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:04:04.994269", "step": 1916, "epoch": 2 }, { "type": "loss", "content": 0.3443259596824646, "timestamp": "2025-09-05 09:04:04.997681", "step": 1917, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:04:05.165754", "step": 1917, "epoch": 2 }, { "type": "loss", "content": 0.30746108293533325, "timestamp": "2025-09-05 09:04:05.167548", "step": 1918, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:04:05.335215", "step": 1918, "epoch": 2 }, { "type": "loss", "content": 0.2145860493183136, "timestamp": "2025-09-05 09:04:05.338280", "step": 1919, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:04:05.544231", "step": 1919, "epoch": 2 }, { "type": "loss", "content": 0.3346792459487915, "timestamp": "2025-09-05 09:04:05.558510", "step": 1920, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 09:04:10.351935", "step": 1920, "epoch": 2 }, { "type": "pplx", "content": 56.15066865394947, "timestamp": "2025-09-05 09:04:10.353809", "step": 1920, "epoch": 2 }, { "type": "info", "content": "Checkpoint saved at step 1920", "timestamp": "2025-09-05 09:04:10.783963", "step": 1920, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:04:10.952777", "step": 1920, "epoch": 2 }, { "type": "loss", "content": 0.34421074390411377, "timestamp": "2025-09-05 09:04:10.954701", "step": 1921, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:04:11.123289", "step": 1921, "epoch": 2 }, { "type": "loss", "content": 0.27249059081077576, "timestamp": "2025-09-05 09:04:11.125101", "step": 1922, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:04:11.292348", "step": 1922, "epoch": 2 }, { "type": "loss", "content": 0.2681417465209961, "timestamp": "2025-09-05 09:04:11.295915", "step": 1923, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 4480027263872.0 }, "timestamp": "2025-09-05 09:04:11.506679", "step": 1923, "epoch": 2 }, { "type": "loss", "content": 0.4062962234020233, "timestamp": "2025-09-05 09:04:11.516326", "step": 1924, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:04:11.678786", "step": 1924, "epoch": 2 }, { "type": "loss", "content": 0.3572784960269928, "timestamp": "2025-09-05 09:04:11.680783", "step": 1925, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 4480027263872.0 }, "timestamp": "2025-09-05 09:04:11.848575", "step": 1925, "epoch": 2 }, { "type": "loss", "content": 0.3298678994178772, "timestamp": "2025-09-05 09:04:11.850534", "step": 1926, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:04:12.045465", "step": 1926, "epoch": 2 }, { "type": "loss", "content": 0.17969104647636414, "timestamp": "2025-09-05 09:04:12.047081", "step": 1927, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:04:12.213555", "step": 1927, "epoch": 2 }, { "type": "loss", "content": 0.27280840277671814, "timestamp": "2025-09-05 09:04:12.227905", "step": 1928, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:04:12.417495", "step": 1928, "epoch": 2 }, { "type": "loss", "content": 0.3779914379119873, "timestamp": "2025-09-05 09:04:12.420129", "step": 1929, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:04:12.587001", "step": 1929, "epoch": 2 }, { "type": "loss", "content": 0.32336172461509705, "timestamp": "2025-09-05 09:04:12.589142", "step": 1930, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:04:12.796003", "step": 1930, "epoch": 2 }, { "type": "loss", "content": 0.5328038930892944, "timestamp": "2025-09-05 09:04:12.802014", "step": 1931, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:04:13.012545", "step": 1931, "epoch": 2 }, { "type": "loss", "content": 0.2591715157032013, "timestamp": "2025-09-05 09:04:13.022339", "step": 1932, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:04:13.186085", "step": 1932, "epoch": 2 }, { "type": "loss", "content": 0.39072778820991516, "timestamp": "2025-09-05 09:04:13.188276", "step": 1933, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:04:13.357480", "step": 1933, "epoch": 2 }, { "type": "loss", "content": 0.20760498940944672, "timestamp": "2025-09-05 09:04:13.359460", "step": 1934, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:04:13.528291", "step": 1934, "epoch": 2 }, { "type": "loss", "content": 0.2971227467060089, "timestamp": "2025-09-05 09:04:13.530536", "step": 1935, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:04:13.728107", "step": 1935, "epoch": 2 }, { "type": "loss", "content": 0.3519618809223175, "timestamp": "2025-09-05 09:04:13.743615", "step": 1936, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:04:13.935962", "step": 1936, "epoch": 2 }, { "type": "loss", "content": 0.4732755720615387, "timestamp": "2025-09-05 09:04:13.937754", "step": 1937, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:04:14.144175", "step": 1937, "epoch": 2 }, { "type": "loss", "content": 0.22129914164543152, "timestamp": "2025-09-05 09:04:14.146537", "step": 1938, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:04:14.315053", "step": 1938, "epoch": 2 }, { "type": "loss", "content": 0.27725905179977417, "timestamp": "2025-09-05 09:04:14.317471", "step": 1939, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:04:14.483480", "step": 1939, "epoch": 2 }, { "type": "loss", "content": 0.30071911215782166, "timestamp": "2025-09-05 09:04:14.498059", "step": 1940, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 09:04:19.889789", "step": 1940, "epoch": 2 }, { "type": "pplx", "content": 56.40086004465284, "timestamp": "2025-09-05 09:04:19.897148", "step": 1940, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:04:20.060950", "step": 1940, "epoch": 2 }, { "type": "loss", "content": 0.47121545672416687, "timestamp": "2025-09-05 09:04:20.063252", "step": 1941, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:04:20.230989", "step": 1941, "epoch": 2 }, { "type": "loss", "content": 0.2984376847743988, "timestamp": "2025-09-05 09:04:20.233513", "step": 1942, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:04:20.440935", "step": 1942, "epoch": 2 }, { "type": "loss", "content": 0.2947276532649994, "timestamp": "2025-09-05 09:04:20.443535", "step": 1943, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:04:20.611995", "step": 1943, "epoch": 2 }, { "type": "loss", "content": 0.3717845380306244, "timestamp": "2025-09-05 09:04:20.628113", "step": 1944, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:04:20.816134", "step": 1944, "epoch": 2 }, { "type": "loss", "content": 0.27086225152015686, "timestamp": "2025-09-05 09:04:20.818437", "step": 1945, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:04:20.986429", "step": 1945, "epoch": 2 }, { "type": "loss", "content": 0.36322546005249023, "timestamp": "2025-09-05 09:04:20.989100", "step": 1946, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:04:21.186891", "step": 1946, "epoch": 2 }, { "type": "loss", "content": 0.3279218077659607, "timestamp": "2025-09-05 09:04:21.188653", "step": 1947, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:04:21.356516", "step": 1947, "epoch": 2 }, { "type": "loss", "content": 0.3289004862308502, "timestamp": "2025-09-05 09:04:21.371172", "step": 1948, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:04:21.559646", "step": 1948, "epoch": 2 }, { "type": "loss", "content": 0.23576058447360992, "timestamp": "2025-09-05 09:04:21.561647", "step": 1949, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:04:21.767476", "step": 1949, "epoch": 2 }, { "type": "loss", "content": 0.4314030408859253, "timestamp": "2025-09-05 09:04:21.769580", "step": 1950, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:04:21.975210", "step": 1950, "epoch": 2 }, { "type": "loss", "content": 0.2590286135673523, "timestamp": "2025-09-05 09:04:21.977268", "step": 1951, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:04:22.143053", "step": 1951, "epoch": 2 }, { "type": "loss", "content": 0.3938618004322052, "timestamp": "2025-09-05 09:04:22.158294", "step": 1952, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 4480027263872.0 }, "timestamp": "2025-09-05 09:04:22.380932", "step": 1952, "epoch": 2 }, { "type": "loss", "content": 0.41767221689224243, "timestamp": "2025-09-05 09:04:22.383122", "step": 1953, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:04:22.602652", "step": 1953, "epoch": 2 }, { "type": "loss", "content": 0.25870412588119507, "timestamp": "2025-09-05 09:04:22.605208", "step": 1954, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:04:22.771168", "step": 1954, "epoch": 2 }, { "type": "loss", "content": 0.37756073474884033, "timestamp": "2025-09-05 09:04:22.774225", "step": 1955, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:04:22.982874", "step": 1955, "epoch": 2 }, { "type": "loss", "content": 0.3379029631614685, "timestamp": "2025-09-05 09:04:22.993241", "step": 1956, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:04:23.157972", "step": 1956, "epoch": 2 }, { "type": "loss", "content": 0.26806700229644775, "timestamp": "2025-09-05 09:04:23.160348", "step": 1957, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:04:23.326717", "step": 1957, "epoch": 2 }, { "type": "loss", "content": 0.2842908203601837, "timestamp": "2025-09-05 09:04:23.329832", "step": 1958, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:04:23.497903", "step": 1958, "epoch": 2 }, { "type": "loss", "content": 0.45978856086730957, "timestamp": "2025-09-05 09:04:23.500376", "step": 1959, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:04:23.698055", "step": 1959, "epoch": 2 }, { "type": "loss", "content": 0.23071345686912537, "timestamp": "2025-09-05 09:04:23.708192", "step": 1960, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 09:04:28.419098", "step": 1960, "epoch": 2 }, { "type": "pplx", "content": 56.53060956108376, "timestamp": "2025-09-05 09:04:28.421195", "step": 1960, "epoch": 2 }, { "type": "info", "content": "Checkpoint saved at step 1960", "timestamp": "2025-09-05 09:04:28.888174", "step": 1960, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:04:29.072136", "step": 1960, "epoch": 2 }, { "type": "loss", "content": 0.2766428589820862, "timestamp": "2025-09-05 09:04:29.074140", "step": 1961, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:04:29.275781", "step": 1961, "epoch": 2 }, { "type": "loss", "content": 0.4845949113368988, "timestamp": "2025-09-05 09:04:29.277972", "step": 1962, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:04:29.473812", "step": 1962, "epoch": 2 }, { "type": "loss", "content": 0.28940051794052124, "timestamp": "2025-09-05 09:04:29.475571", "step": 1963, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:04:29.642041", "step": 1963, "epoch": 2 }, { "type": "loss", "content": 0.3159952163696289, "timestamp": "2025-09-05 09:04:29.659688", "step": 1964, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-05 09:04:29.853746", "step": 1964, "epoch": 2 }, { "type": "loss", "content": 0.3902105391025543, "timestamp": "2025-09-05 09:04:29.855230", "step": 1965, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:04:30.058508", "step": 1965, "epoch": 2 }, { "type": "loss", "content": 0.2579273283481598, "timestamp": "2025-09-05 09:04:30.060888", "step": 1966, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:04:30.260907", "step": 1966, "epoch": 2 }, { "type": "loss", "content": 0.28875139355659485, "timestamp": "2025-09-05 09:04:30.263070", "step": 1967, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:04:30.530524", "step": 1967, "epoch": 2 }, { "type": "loss", "content": 0.3324761390686035, "timestamp": "2025-09-05 09:04:30.540760", "step": 1968, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:04:30.707202", "step": 1968, "epoch": 2 }, { "type": "loss", "content": 0.3139239251613617, "timestamp": "2025-09-05 09:04:30.709065", "step": 1969, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:04:30.877108", "step": 1969, "epoch": 2 }, { "type": "loss", "content": 0.2467249482870102, "timestamp": "2025-09-05 09:04:30.879338", "step": 1970, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:04:31.085452", "step": 1970, "epoch": 2 }, { "type": "loss", "content": 0.27524515986442566, "timestamp": "2025-09-05 09:04:31.087634", "step": 1971, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:04:31.294610", "step": 1971, "epoch": 2 }, { "type": "loss", "content": 0.2604565918445587, "timestamp": "2025-09-05 09:04:31.304908", "step": 1972, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:04:31.468728", "step": 1972, "epoch": 2 }, { "type": "loss", "content": 0.3006044030189514, "timestamp": "2025-09-05 09:04:31.470507", "step": 1973, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:04:31.656392", "step": 1973, "epoch": 2 }, { "type": "loss", "content": 0.3287713825702667, "timestamp": "2025-09-05 09:04:31.658541", "step": 1974, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:04:31.857546", "step": 1974, "epoch": 2 }, { "type": "loss", "content": 0.3500474989414215, "timestamp": "2025-09-05 09:04:31.860148", "step": 1975, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:04:32.066595", "step": 1975, "epoch": 2 }, { "type": "loss", "content": 0.24871431291103363, "timestamp": "2025-09-05 09:04:32.081341", "step": 1976, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:04:32.271920", "step": 1976, "epoch": 2 }, { "type": "loss", "content": 0.3919868469238281, "timestamp": "2025-09-05 09:04:32.273912", "step": 1977, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:04:32.440681", "step": 1977, "epoch": 2 }, { "type": "loss", "content": 0.2948307693004608, "timestamp": "2025-09-05 09:04:32.442952", "step": 1978, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:04:32.648995", "step": 1978, "epoch": 2 }, { "type": "loss", "content": 0.4514521062374115, "timestamp": "2025-09-05 09:04:32.651170", "step": 1979, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-05 09:04:32.856832", "step": 1979, "epoch": 2 }, { "type": "loss", "content": 0.3044537305831909, "timestamp": "2025-09-05 09:04:32.871644", "step": 1980, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 09:04:37.538105", "step": 1980, "epoch": 2 }, { "type": "pplx", "content": 56.71036435354335, "timestamp": "2025-09-05 09:04:37.539835", "step": 1980, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:04:37.703241", "step": 1980, "epoch": 2 }, { "type": "loss", "content": 0.29383978247642517, "timestamp": "2025-09-05 09:04:37.705202", "step": 1981, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:04:37.873478", "step": 1981, "epoch": 2 }, { "type": "loss", "content": 0.4132064878940582, "timestamp": "2025-09-05 09:04:37.876394", "step": 1982, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:04:38.043828", "step": 1982, "epoch": 2 }, { "type": "loss", "content": 0.1803075671195984, "timestamp": "2025-09-05 09:04:38.046427", "step": 1983, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:04:38.241865", "step": 1983, "epoch": 2 }, { "type": "loss", "content": 0.411940336227417, "timestamp": "2025-09-05 09:04:38.257614", "step": 1984, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 4800029206464.0 }, "timestamp": "2025-09-05 09:04:38.446287", "step": 1984, "epoch": 2 }, { "type": "loss", "content": 0.3590596616268158, "timestamp": "2025-09-05 09:04:38.448991", "step": 1985, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:04:38.645695", "step": 1985, "epoch": 2 }, { "type": "loss", "content": 0.28954413533210754, "timestamp": "2025-09-05 09:04:38.647436", "step": 1986, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-05 09:04:38.812350", "step": 1986, "epoch": 2 }, { "type": "loss", "content": 0.25408488512039185, "timestamp": "2025-09-05 09:04:38.814460", "step": 1987, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:04:39.009224", "step": 1987, "epoch": 2 }, { "type": "loss", "content": 0.18480801582336426, "timestamp": "2025-09-05 09:04:39.018433", "step": 1988, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:04:39.182413", "step": 1988, "epoch": 2 }, { "type": "loss", "content": 0.305034339427948, "timestamp": "2025-09-05 09:04:39.184267", "step": 1989, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:04:39.390607", "step": 1989, "epoch": 2 }, { "type": "loss", "content": 0.23244498670101166, "timestamp": "2025-09-05 09:04:39.392494", "step": 1990, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:04:39.598816", "step": 1990, "epoch": 2 }, { "type": "loss", "content": 0.40569478273391724, "timestamp": "2025-09-05 09:04:39.600817", "step": 1991, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:04:39.768888", "step": 1991, "epoch": 2 }, { "type": "loss", "content": 0.3343004882335663, "timestamp": "2025-09-05 09:04:39.784051", "step": 1992, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:04:39.972490", "step": 1992, "epoch": 2 }, { "type": "loss", "content": 0.2612588405609131, "timestamp": "2025-09-05 09:04:39.974222", "step": 1993, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 4800029206464.0 }, "timestamp": "2025-09-05 09:04:40.142246", "step": 1993, "epoch": 2 }, { "type": "loss", "content": 0.3457273542881012, "timestamp": "2025-09-05 09:04:40.144612", "step": 1994, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:04:40.311740", "step": 1994, "epoch": 2 }, { "type": "loss", "content": 0.32286980748176575, "timestamp": "2025-09-05 09:04:40.314822", "step": 1995, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:04:40.521126", "step": 1995, "epoch": 2 }, { "type": "loss", "content": 0.32212406396865845, "timestamp": "2025-09-05 09:04:40.530489", "step": 1996, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:04:40.694811", "step": 1996, "epoch": 2 }, { "type": "loss", "content": 0.35238468647003174, "timestamp": "2025-09-05 09:04:40.697027", "step": 1997, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:04:40.864697", "step": 1997, "epoch": 2 }, { "type": "loss", "content": 0.3432472050189972, "timestamp": "2025-09-05 09:04:40.866865", "step": 1998, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:04:41.064125", "step": 1998, "epoch": 2 }, { "type": "loss", "content": 0.30503416061401367, "timestamp": "2025-09-05 09:04:41.066609", "step": 1999, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:04:41.234458", "step": 1999, "epoch": 2 }, { "type": "loss", "content": 0.26616600155830383, "timestamp": "2025-09-05 09:04:41.248958", "step": 2000, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 09:04:46.035115", "step": 2000, "epoch": 2 }, { "type": "pplx", "content": 57.183245029940046, "timestamp": "2025-09-05 09:04:46.039931", "step": 2000, "epoch": 2 }, { "type": "info", "content": "Checkpoint saved at step 2000", "timestamp": "2025-09-05 09:04:46.512943", "step": 2000, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:04:46.680863", "step": 2000, "epoch": 2 }, { "type": "loss", "content": 0.27458614110946655, "timestamp": "2025-09-05 09:04:46.683153", "step": 2001, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:04:46.878428", "step": 2001, "epoch": 2 }, { "type": "loss", "content": 0.3308562934398651, "timestamp": "2025-09-05 09:04:46.880173", "step": 2002, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:04:47.085991", "step": 2002, "epoch": 2 }, { "type": "loss", "content": 0.4302447736263275, "timestamp": "2025-09-05 09:04:47.089966", "step": 2003, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:04:47.289790", "step": 2003, "epoch": 2 }, { "type": "loss", "content": 0.23679831624031067, "timestamp": "2025-09-05 09:04:47.299469", "step": 2004, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:04:47.462911", "step": 2004, "epoch": 2 }, { "type": "loss", "content": 0.3112923204898834, "timestamp": "2025-09-05 09:04:47.465129", "step": 2005, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:04:47.632457", "step": 2005, "epoch": 2 }, { "type": "loss", "content": 0.30558183789253235, "timestamp": "2025-09-05 09:04:47.634451", "step": 2006, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:04:47.839869", "step": 2006, "epoch": 2 }, { "type": "loss", "content": 0.3617997467517853, "timestamp": "2025-09-05 09:04:47.842181", "step": 2007, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:04:48.008505", "step": 2007, "epoch": 2 }, { "type": "loss", "content": 0.23063942790031433, "timestamp": "2025-09-05 09:04:48.025193", "step": 2008, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:04:48.221927", "step": 2008, "epoch": 2 }, { "type": "loss", "content": 0.36768314242362976, "timestamp": "2025-09-05 09:04:48.224032", "step": 2009, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:04:48.391869", "step": 2009, "epoch": 2 }, { "type": "loss", "content": 0.3346289396286011, "timestamp": "2025-09-05 09:04:48.393707", "step": 2010, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 4480027263872.0 }, "timestamp": "2025-09-05 09:04:48.590531", "step": 2010, "epoch": 2 }, { "type": "loss", "content": 0.3734259605407715, "timestamp": "2025-09-05 09:04:48.595428", "step": 2011, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:04:48.810951", "step": 2011, "epoch": 2 }, { "type": "loss", "content": 0.3368525505065918, "timestamp": "2025-09-05 09:04:48.825300", "step": 2012, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:04:49.013722", "step": 2012, "epoch": 2 }, { "type": "loss", "content": 0.3550032377243042, "timestamp": "2025-09-05 09:04:49.015412", "step": 2013, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:04:49.220262", "step": 2013, "epoch": 2 }, { "type": "loss", "content": 0.42761796712875366, "timestamp": "2025-09-05 09:04:49.222077", "step": 2014, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:04:49.428047", "step": 2014, "epoch": 2 }, { "type": "loss", "content": 0.23654963076114655, "timestamp": "2025-09-05 09:04:49.430134", "step": 2015, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:04:49.596867", "step": 2015, "epoch": 2 }, { "type": "loss", "content": 0.2988823354244232, "timestamp": "2025-09-05 09:04:49.611145", "step": 2016, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:04:49.799003", "step": 2016, "epoch": 2 }, { "type": "loss", "content": 0.265523761510849, "timestamp": "2025-09-05 09:04:49.800983", "step": 2017, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:04:50.008530", "step": 2017, "epoch": 2 }, { "type": "loss", "content": 0.4000529944896698, "timestamp": "2025-09-05 09:04:50.010746", "step": 2018, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:04:50.178544", "step": 2018, "epoch": 2 }, { "type": "loss", "content": 0.24551428854465485, "timestamp": "2025-09-05 09:04:50.180673", "step": 2019, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:04:50.380479", "step": 2019, "epoch": 2 }, { "type": "loss", "content": 0.15637749433517456, "timestamp": "2025-09-05 09:04:50.390664", "step": 2020, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 09:04:55.197240", "step": 2020, "epoch": 2 }, { "type": "pplx", "content": 57.22559411581902, "timestamp": "2025-09-05 09:04:55.199506", "step": 2020, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:04:55.360725", "step": 2020, "epoch": 2 }, { "type": "loss", "content": 0.29269856214523315, "timestamp": "2025-09-05 09:04:55.362490", "step": 2021, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:04:55.532762", "step": 2021, "epoch": 2 }, { "type": "loss", "content": 0.40585339069366455, "timestamp": "2025-09-05 09:04:55.535125", "step": 2022, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:04:55.741574", "step": 2022, "epoch": 2 }, { "type": "loss", "content": 0.4142700135707855, "timestamp": "2025-09-05 09:04:55.743607", "step": 2023, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:04:55.911896", "step": 2023, "epoch": 2 }, { "type": "loss", "content": 0.3398078680038452, "timestamp": "2025-09-05 09:04:55.927449", "step": 2024, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:04:56.115575", "step": 2024, "epoch": 2 }, { "type": "loss", "content": 0.27966639399528503, "timestamp": "2025-09-05 09:04:56.117340", "step": 2025, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:04:56.283716", "step": 2025, "epoch": 2 }, { "type": "loss", "content": 0.3459966480731964, "timestamp": "2025-09-05 09:04:56.285535", "step": 2026, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:04:56.479497", "step": 2026, "epoch": 2 }, { "type": "loss", "content": 0.26111602783203125, "timestamp": "2025-09-05 09:04:56.481512", "step": 2027, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:04:56.647710", "step": 2027, "epoch": 2 }, { "type": "loss", "content": 0.2696475386619568, "timestamp": "2025-09-05 09:04:56.663724", "step": 2028, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:04:56.855157", "step": 2028, "epoch": 2 }, { "type": "loss", "content": 0.44315868616104126, "timestamp": "2025-09-05 09:04:56.858681", "step": 2029, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:04:57.029209", "step": 2029, "epoch": 2 }, { "type": "loss", "content": 0.37835097312927246, "timestamp": "2025-09-05 09:04:57.031189", "step": 2030, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:04:57.198452", "step": 2030, "epoch": 2 }, { "type": "loss", "content": 0.4174140989780426, "timestamp": "2025-09-05 09:04:57.200571", "step": 2031, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:04:57.397154", "step": 2031, "epoch": 2 }, { "type": "loss", "content": 0.3270459473133087, "timestamp": "2025-09-05 09:04:57.407649", "step": 2032, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:04:57.569889", "step": 2032, "epoch": 2 }, { "type": "loss", "content": 0.22918665409088135, "timestamp": "2025-09-05 09:04:57.572029", "step": 2033, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:04:57.778081", "step": 2033, "epoch": 2 }, { "type": "loss", "content": 0.37719905376434326, "timestamp": "2025-09-05 09:04:57.780047", "step": 2034, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-05 09:04:57.985555", "step": 2034, "epoch": 2 }, { "type": "loss", "content": 0.32334667444229126, "timestamp": "2025-09-05 09:04:57.989649", "step": 2035, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:04:58.201644", "step": 2035, "epoch": 2 }, { "type": "loss", "content": 0.343068391084671, "timestamp": "2025-09-05 09:04:58.259790", "step": 2036, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:04:58.509098", "step": 2036, "epoch": 2 }, { "type": "loss", "content": 0.4084326922893524, "timestamp": "2025-09-05 09:04:58.511936", "step": 2037, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:04:58.717269", "step": 2037, "epoch": 2 }, { "type": "loss", "content": 0.24380765855312347, "timestamp": "2025-09-05 09:04:58.720014", "step": 2038, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:04:58.927965", "step": 2038, "epoch": 2 }, { "type": "loss", "content": 0.21494068205356598, "timestamp": "2025-09-05 09:04:58.930477", "step": 2039, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:04:59.102037", "step": 2039, "epoch": 2 }, { "type": "loss", "content": 0.24519406259059906, "timestamp": "2025-09-05 09:04:59.112573", "step": 2040, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 09:05:03.749817", "step": 2040, "epoch": 2 }, { "type": "pplx", "content": 56.85761600576899, "timestamp": "2025-09-05 09:05:03.751727", "step": 2040, "epoch": 2 }, { "type": "info", "content": "Checkpoint saved at step 2040", "timestamp": "2025-09-05 09:05:04.212748", "step": 2040, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:05:04.379772", "step": 2040, "epoch": 2 }, { "type": "loss", "content": 0.21963070333003998, "timestamp": "2025-09-05 09:05:04.381772", "step": 2041, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:05:04.576449", "step": 2041, "epoch": 2 }, { "type": "loss", "content": 0.2468557208776474, "timestamp": "2025-09-05 09:05:04.580133", "step": 2042, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:05:04.775998", "step": 2042, "epoch": 2 }, { "type": "loss", "content": 0.38554346561431885, "timestamp": "2025-09-05 09:05:04.778595", "step": 2043, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:05:04.948336", "step": 2043, "epoch": 2 }, { "type": "loss", "content": 0.2700977325439453, "timestamp": "2025-09-05 09:05:04.965970", "step": 2044, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 4480027263872.0 }, "timestamp": "2025-09-05 09:05:05.163837", "step": 2044, "epoch": 2 }, { "type": "loss", "content": 0.32830366492271423, "timestamp": "2025-09-05 09:05:05.167027", "step": 2045, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:05:05.337485", "step": 2045, "epoch": 2 }, { "type": "loss", "content": 0.41252601146698, "timestamp": "2025-09-05 09:05:05.339559", "step": 2046, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:05:05.506711", "step": 2046, "epoch": 2 }, { "type": "loss", "content": 0.23501189053058624, "timestamp": "2025-09-05 09:05:05.509934", "step": 2047, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:05:05.676582", "step": 2047, "epoch": 2 }, { "type": "loss", "content": 0.406737744808197, "timestamp": "2025-09-05 09:05:05.686266", "step": 2048, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:05:05.850915", "step": 2048, "epoch": 2 }, { "type": "loss", "content": 0.3673907220363617, "timestamp": "2025-09-05 09:05:05.852489", "step": 2049, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:05:06.047152", "step": 2049, "epoch": 2 }, { "type": "loss", "content": 0.3338184058666229, "timestamp": "2025-09-05 09:05:06.050231", "step": 2050, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:05:06.254566", "step": 2050, "epoch": 2 }, { "type": "loss", "content": 0.3576391935348511, "timestamp": "2025-09-05 09:05:06.257351", "step": 2051, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:05:06.456315", "step": 2051, "epoch": 2 }, { "type": "loss", "content": 0.4063916504383087, "timestamp": "2025-09-05 09:05:06.465686", "step": 2052, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:05:06.630051", "step": 2052, "epoch": 2 }, { "type": "loss", "content": 0.30287304520606995, "timestamp": "2025-09-05 09:05:06.632774", "step": 2053, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:05:06.801724", "step": 2053, "epoch": 2 }, { "type": "loss", "content": 0.31338661909103394, "timestamp": "2025-09-05 09:05:06.804131", "step": 2054, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:05:07.000461", "step": 2054, "epoch": 2 }, { "type": "loss", "content": 0.30023157596588135, "timestamp": "2025-09-05 09:05:07.002435", "step": 2055, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:05:07.169863", "step": 2055, "epoch": 2 }, { "type": "loss", "content": 0.5755932927131653, "timestamp": "2025-09-05 09:05:07.187296", "step": 2056, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:05:07.385750", "step": 2056, "epoch": 2 }, { "type": "loss", "content": 0.44111013412475586, "timestamp": "2025-09-05 09:05:07.389213", "step": 2057, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:05:07.596791", "step": 2057, "epoch": 2 }, { "type": "loss", "content": 0.3329983651638031, "timestamp": "2025-09-05 09:05:07.598769", "step": 2058, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:05:07.794782", "step": 2058, "epoch": 2 }, { "type": "loss", "content": 0.24375692009925842, "timestamp": "2025-09-05 09:05:07.797646", "step": 2059, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:05:07.965368", "step": 2059, "epoch": 2 }, { "type": "loss", "content": 0.27790704369544983, "timestamp": "2025-09-05 09:05:07.982541", "step": 2060, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 09:05:12.660572", "step": 2060, "epoch": 2 }, { "type": "pplx", "content": 56.09090719183764, "timestamp": "2025-09-05 09:05:12.662701", "step": 2060, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:05:12.824471", "step": 2060, "epoch": 2 }, { "type": "loss", "content": 0.2383831888437271, "timestamp": "2025-09-05 09:05:12.826666", "step": 2061, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:05:12.993388", "step": 2061, "epoch": 2 }, { "type": "loss", "content": 0.3052305579185486, "timestamp": "2025-09-05 09:05:12.995388", "step": 2062, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:05:13.161817", "step": 2062, "epoch": 2 }, { "type": "loss", "content": 0.18823660910129547, "timestamp": "2025-09-05 09:05:13.166329", "step": 2063, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:05:13.365333", "step": 2063, "epoch": 2 }, { "type": "loss", "content": 0.28078800439834595, "timestamp": "2025-09-05 09:05:13.422659", "step": 2064, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:05:13.620690", "step": 2064, "epoch": 2 }, { "type": "loss", "content": 0.42778801918029785, "timestamp": "2025-09-05 09:05:13.622539", "step": 2065, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:05:13.818849", "step": 2065, "epoch": 2 }, { "type": "loss", "content": 0.3101421594619751, "timestamp": "2025-09-05 09:05:13.821061", "step": 2066, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:05:14.017518", "step": 2066, "epoch": 2 }, { "type": "loss", "content": 0.27489471435546875, "timestamp": "2025-09-05 09:05:14.020061", "step": 2067, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:05:14.228453", "step": 2067, "epoch": 2 }, { "type": "loss", "content": 0.23832768201828003, "timestamp": "2025-09-05 09:05:14.238406", "step": 2068, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:05:14.403227", "step": 2068, "epoch": 2 }, { "type": "loss", "content": 0.3375934064388275, "timestamp": "2025-09-05 09:05:14.405573", "step": 2069, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:05:14.610505", "step": 2069, "epoch": 2 }, { "type": "loss", "content": 0.3630978763103485, "timestamp": "2025-09-05 09:05:14.612484", "step": 2070, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 4480027263872.0 }, "timestamp": "2025-09-05 09:05:14.779489", "step": 2070, "epoch": 2 }, { "type": "loss", "content": 0.3400574326515198, "timestamp": "2025-09-05 09:05:14.781916", "step": 2071, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:05:14.980142", "step": 2071, "epoch": 2 }, { "type": "loss", "content": 0.40873268246650696, "timestamp": "2025-09-05 09:05:14.994736", "step": 2072, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:05:15.184588", "step": 2072, "epoch": 2 }, { "type": "loss", "content": 0.3188339173793793, "timestamp": "2025-09-05 09:05:15.187393", "step": 2073, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:05:15.356461", "step": 2073, "epoch": 2 }, { "type": "loss", "content": 0.2616739869117737, "timestamp": "2025-09-05 09:05:15.358806", "step": 2074, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:05:15.556729", "step": 2074, "epoch": 2 }, { "type": "loss", "content": 0.38481009006500244, "timestamp": "2025-09-05 09:05:15.558582", "step": 2075, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:05:15.727982", "step": 2075, "epoch": 2 }, { "type": "loss", "content": 0.3673509657382965, "timestamp": "2025-09-05 09:05:15.737912", "step": 2076, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:05:15.904494", "step": 2076, "epoch": 2 }, { "type": "loss", "content": 0.34988126158714294, "timestamp": "2025-09-05 09:05:15.906553", "step": 2077, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:05:16.111211", "step": 2077, "epoch": 2 }, { "type": "loss", "content": 0.262952983379364, "timestamp": "2025-09-05 09:05:16.113704", "step": 2078, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:05:16.280634", "step": 2078, "epoch": 2 }, { "type": "loss", "content": 0.33671560883522034, "timestamp": "2025-09-05 09:05:16.283544", "step": 2079, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:05:16.478700", "step": 2079, "epoch": 2 }, { "type": "loss", "content": 0.35512277483940125, "timestamp": "2025-09-05 09:05:16.488316", "step": 2080, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 09:05:21.153968", "step": 2080, "epoch": 2 }, { "type": "pplx", "content": 56.30905831955004, "timestamp": "2025-09-05 09:05:21.155970", "step": 2080, "epoch": 2 }, { "type": "info", "content": "Checkpoint saved at step 2080", "timestamp": "2025-09-05 09:05:21.636265", "step": 2080, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:05:21.805059", "step": 2080, "epoch": 2 }, { "type": "loss", "content": 0.3978932201862335, "timestamp": "2025-09-05 09:05:21.806980", "step": 2081, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:05:21.973582", "step": 2081, "epoch": 2 }, { "type": "loss", "content": 0.23755325376987457, "timestamp": "2025-09-05 09:05:21.975776", "step": 2082, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:05:22.181802", "step": 2082, "epoch": 2 }, { "type": "loss", "content": 0.26282647252082825, "timestamp": "2025-09-05 09:05:22.184030", "step": 2083, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:05:22.390211", "step": 2083, "epoch": 2 }, { "type": "loss", "content": 0.22978627681732178, "timestamp": "2025-09-05 09:05:22.399814", "step": 2084, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:05:22.564860", "step": 2084, "epoch": 2 }, { "type": "loss", "content": 0.34872758388519287, "timestamp": "2025-09-05 09:05:22.566428", "step": 2085, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:05:22.733852", "step": 2085, "epoch": 2 }, { "type": "loss", "content": 0.26038891077041626, "timestamp": "2025-09-05 09:05:22.735811", "step": 2086, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:05:22.941243", "step": 2086, "epoch": 2 }, { "type": "loss", "content": 0.2772735357284546, "timestamp": "2025-09-05 09:05:22.943426", "step": 2087, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:05:23.171037", "step": 2087, "epoch": 2 }, { "type": "loss", "content": 0.3665330111980438, "timestamp": "2025-09-05 09:05:23.185480", "step": 2088, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:05:23.377188", "step": 2088, "epoch": 2 }, { "type": "loss", "content": 0.30773794651031494, "timestamp": "2025-09-05 09:05:23.378924", "step": 2089, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:05:23.547670", "step": 2089, "epoch": 2 }, { "type": "loss", "content": 0.3149157464504242, "timestamp": "2025-09-05 09:05:23.549514", "step": 2090, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:05:23.716414", "step": 2090, "epoch": 2 }, { "type": "loss", "content": 0.26793622970581055, "timestamp": "2025-09-05 09:05:23.718590", "step": 2091, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:05:23.915903", "step": 2091, "epoch": 2 }, { "type": "loss", "content": 0.23315511643886566, "timestamp": "2025-09-05 09:05:23.930531", "step": 2092, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:05:24.126973", "step": 2092, "epoch": 2 }, { "type": "loss", "content": 0.33972030878067017, "timestamp": "2025-09-05 09:05:24.129837", "step": 2093, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:05:24.328693", "step": 2093, "epoch": 2 }, { "type": "loss", "content": 0.24864479899406433, "timestamp": "2025-09-05 09:05:24.332201", "step": 2094, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:05:24.530634", "step": 2094, "epoch": 2 }, { "type": "loss", "content": 0.4089866876602173, "timestamp": "2025-09-05 09:05:24.532314", "step": 2095, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:05:24.700655", "step": 2095, "epoch": 2 }, { "type": "loss", "content": 0.36251306533813477, "timestamp": "2025-09-05 09:05:24.710674", "step": 2096, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 4480027263872.0 }, "timestamp": "2025-09-05 09:05:24.877319", "step": 2096, "epoch": 2 }, { "type": "loss", "content": 0.36653363704681396, "timestamp": "2025-09-05 09:05:24.880792", "step": 2097, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:05:25.090908", "step": 2097, "epoch": 2 }, { "type": "loss", "content": 0.2979757487773895, "timestamp": "2025-09-05 09:05:25.093136", "step": 2098, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:05:25.264142", "step": 2098, "epoch": 2 }, { "type": "loss", "content": 0.3006058633327484, "timestamp": "2025-09-05 09:05:25.266754", "step": 2099, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:05:25.476399", "step": 2099, "epoch": 2 }, { "type": "loss", "content": 0.3439769148826599, "timestamp": "2025-09-05 09:05:25.493166", "step": 2100, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 09:05:30.301851", "step": 2100, "epoch": 2 }, { "type": "pplx", "content": 55.87515440553129, "timestamp": "2025-09-05 09:05:30.304097", "step": 2100, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 4480027263872.0 }, "timestamp": "2025-09-05 09:05:30.467899", "step": 2100, "epoch": 2 }, { "type": "loss", "content": 0.3255309760570526, "timestamp": "2025-09-05 09:05:30.469795", "step": 2101, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:05:30.635767", "step": 2101, "epoch": 2 }, { "type": "loss", "content": 0.23818431794643402, "timestamp": "2025-09-05 09:05:30.637417", "step": 2102, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:05:30.804649", "step": 2102, "epoch": 2 }, { "type": "loss", "content": 0.2935701310634613, "timestamp": "2025-09-05 09:05:30.806664", "step": 2103, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:05:31.003143", "step": 2103, "epoch": 2 }, { "type": "loss", "content": 0.3238224983215332, "timestamp": "2025-09-05 09:05:31.017915", "step": 2104, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:05:31.207656", "step": 2104, "epoch": 2 }, { "type": "loss", "content": 0.26048019528388977, "timestamp": "2025-09-05 09:05:31.209412", "step": 2105, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:05:31.417385", "step": 2105, "epoch": 2 }, { "type": "loss", "content": 0.3144626319408417, "timestamp": "2025-09-05 09:05:31.419292", "step": 2106, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:05:31.615226", "step": 2106, "epoch": 2 }, { "type": "loss", "content": 0.31784558296203613, "timestamp": "2025-09-05 09:05:31.617380", "step": 2107, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:05:31.782250", "step": 2107, "epoch": 2 }, { "type": "loss", "content": 0.3896116018295288, "timestamp": "2025-09-05 09:05:31.799351", "step": 2108, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:05:31.999348", "step": 2108, "epoch": 2 }, { "type": "loss", "content": 0.5450838208198547, "timestamp": "2025-09-05 09:05:32.001492", "step": 2109, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:05:32.207332", "step": 2109, "epoch": 2 }, { "type": "loss", "content": 0.4268830716609955, "timestamp": "2025-09-05 09:05:32.209190", "step": 2110, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:05:32.376126", "step": 2110, "epoch": 2 }, { "type": "loss", "content": 0.2802806496620178, "timestamp": "2025-09-05 09:05:32.379085", "step": 2111, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:05:32.582723", "step": 2111, "epoch": 2 }, { "type": "loss", "content": 0.3529273271560669, "timestamp": "2025-09-05 09:05:32.597710", "step": 2112, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:05:32.786023", "step": 2112, "epoch": 2 }, { "type": "loss", "content": 0.37120527029037476, "timestamp": "2025-09-05 09:05:32.788315", "step": 2113, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:05:32.987392", "step": 2113, "epoch": 2 }, { "type": "loss", "content": 0.2828807532787323, "timestamp": "2025-09-05 09:05:32.989258", "step": 2114, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:05:33.197110", "step": 2114, "epoch": 2 }, { "type": "loss", "content": 0.3188009262084961, "timestamp": "2025-09-05 09:05:33.198841", "step": 2115, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:05:33.397741", "step": 2115, "epoch": 2 }, { "type": "loss", "content": 0.30147457122802734, "timestamp": "2025-09-05 09:05:33.414853", "step": 2116, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:05:33.606839", "step": 2116, "epoch": 2 }, { "type": "loss", "content": 0.39994457364082336, "timestamp": "2025-09-05 09:05:33.609368", "step": 2117, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:05:33.814464", "step": 2117, "epoch": 2 }, { "type": "loss", "content": 0.29162168502807617, "timestamp": "2025-09-05 09:05:33.816904", "step": 2118, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:05:34.012704", "step": 2118, "epoch": 2 }, { "type": "loss", "content": 0.36326169967651367, "timestamp": "2025-09-05 09:05:34.015173", "step": 2119, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:05:34.220510", "step": 2119, "epoch": 2 }, { "type": "loss", "content": 0.39839819073677063, "timestamp": "2025-09-05 09:05:34.238310", "step": 2120, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 09:05:38.962153", "step": 2120, "epoch": 2 }, { "type": "pplx", "content": 55.023435362113, "timestamp": "2025-09-05 09:05:38.964199", "step": 2120, "epoch": 2 }, { "type": "info", "content": "Checkpoint saved at step 2120", "timestamp": "2025-09-05 09:05:39.426004", "step": 2120, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:05:39.590655", "step": 2120, "epoch": 2 }, { "type": "loss", "content": 0.298641175031662, "timestamp": "2025-09-05 09:05:39.592486", "step": 2121, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:05:39.762317", "step": 2121, "epoch": 2 }, { "type": "loss", "content": 0.2771815061569214, "timestamp": "2025-09-05 09:05:39.764288", "step": 2122, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:05:39.970270", "step": 2122, "epoch": 2 }, { "type": "loss", "content": 0.3446735143661499, "timestamp": "2025-09-05 09:05:39.973486", "step": 2123, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:05:40.142076", "step": 2123, "epoch": 2 }, { "type": "loss", "content": 0.32573774456977844, "timestamp": "2025-09-05 09:05:40.158806", "step": 2124, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:05:40.355566", "step": 2124, "epoch": 2 }, { "type": "loss", "content": 0.3329891264438629, "timestamp": "2025-09-05 09:05:40.357398", "step": 2125, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:05:40.524554", "step": 2125, "epoch": 2 }, { "type": "loss", "content": 0.3262787163257599, "timestamp": "2025-09-05 09:05:40.526910", "step": 2126, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:05:40.724145", "step": 2126, "epoch": 2 }, { "type": "loss", "content": 0.28894031047821045, "timestamp": "2025-09-05 09:05:40.726046", "step": 2127, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:05:40.894443", "step": 2127, "epoch": 2 }, { "type": "loss", "content": 0.37630510330200195, "timestamp": "2025-09-05 09:05:40.908812", "step": 2128, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:05:41.096804", "step": 2128, "epoch": 2 }, { "type": "loss", "content": 0.31172171235084534, "timestamp": "2025-09-05 09:05:41.098750", "step": 2129, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:05:41.294440", "step": 2129, "epoch": 2 }, { "type": "loss", "content": 0.3221554756164551, "timestamp": "2025-09-05 09:05:41.296586", "step": 2130, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:05:41.464390", "step": 2130, "epoch": 2 }, { "type": "loss", "content": 0.22313398122787476, "timestamp": "2025-09-05 09:05:41.466652", "step": 2131, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:05:41.671802", "step": 2131, "epoch": 2 }, { "type": "loss", "content": 0.27833616733551025, "timestamp": "2025-09-05 09:05:41.688374", "step": 2132, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:05:41.888539", "step": 2132, "epoch": 2 }, { "type": "loss", "content": 0.35830333828926086, "timestamp": "2025-09-05 09:05:41.890675", "step": 2133, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:05:42.057035", "step": 2133, "epoch": 2 }, { "type": "loss", "content": 0.3745828866958618, "timestamp": "2025-09-05 09:05:42.059027", "step": 2134, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:05:42.256318", "step": 2134, "epoch": 2 }, { "type": "loss", "content": 0.4300999939441681, "timestamp": "2025-09-05 09:05:42.258212", "step": 2135, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:05:42.425915", "step": 2135, "epoch": 2 }, { "type": "loss", "content": 0.4072500169277191, "timestamp": "2025-09-05 09:05:42.440937", "step": 2136, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:05:42.629269", "step": 2136, "epoch": 2 }, { "type": "loss", "content": 0.2526228129863739, "timestamp": "2025-09-05 09:05:42.631217", "step": 2137, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:05:42.837475", "step": 2137, "epoch": 2 }, { "type": "loss", "content": 0.39318081736564636, "timestamp": "2025-09-05 09:05:42.839334", "step": 2138, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:05:43.005908", "step": 2138, "epoch": 2 }, { "type": "loss", "content": 0.2297356128692627, "timestamp": "2025-09-05 09:05:43.008470", "step": 2139, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 4480027263872.0 }, "timestamp": "2025-09-05 09:05:43.206987", "step": 2139, "epoch": 2 }, { "type": "loss", "content": 0.33987030386924744, "timestamp": "2025-09-05 09:05:43.216353", "step": 2140, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 09:05:47.865119", "step": 2140, "epoch": 2 }, { "type": "pplx", "content": 55.06809153484284, "timestamp": "2025-09-05 09:05:47.867422", "step": 2140, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:05:48.032484", "step": 2140, "epoch": 2 }, { "type": "loss", "content": 0.4749826192855835, "timestamp": "2025-09-05 09:05:48.034336", "step": 2141, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:05:48.200139", "step": 2141, "epoch": 2 }, { "type": "loss", "content": 0.28325966000556946, "timestamp": "2025-09-05 09:05:48.201779", "step": 2142, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:05:48.369121", "step": 2142, "epoch": 2 }, { "type": "loss", "content": 0.42103084921836853, "timestamp": "2025-09-05 09:05:48.370850", "step": 2143, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:05:48.566526", "step": 2143, "epoch": 2 }, { "type": "loss", "content": 0.38202551007270813, "timestamp": "2025-09-05 09:05:48.576121", "step": 2144, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:05:48.738085", "step": 2144, "epoch": 2 }, { "type": "loss", "content": 0.3595607578754425, "timestamp": "2025-09-05 09:05:48.740049", "step": 2145, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:05:48.947701", "step": 2145, "epoch": 2 }, { "type": "loss", "content": 0.3413144648075104, "timestamp": "2025-09-05 09:05:48.949394", "step": 2146, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:05:49.156351", "step": 2146, "epoch": 2 }, { "type": "loss", "content": 0.24180643260478973, "timestamp": "2025-09-05 09:05:49.157912", "step": 2147, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:05:49.325916", "step": 2147, "epoch": 2 }, { "type": "loss", "content": 0.2788175344467163, "timestamp": "2025-09-05 09:05:49.342534", "step": 2148, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:05:49.540402", "step": 2148, "epoch": 2 }, { "type": "loss", "content": 0.39272475242614746, "timestamp": "2025-09-05 09:05:49.542346", "step": 2149, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:05:49.709083", "step": 2149, "epoch": 2 }, { "type": "loss", "content": 0.2623364329338074, "timestamp": "2025-09-05 09:05:49.711039", "step": 2150, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:05:49.919434", "step": 2150, "epoch": 2 }, { "type": "loss", "content": 0.21915015578269958, "timestamp": "2025-09-05 09:05:49.921268", "step": 2151, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:05:50.088631", "step": 2151, "epoch": 2 }, { "type": "loss", "content": 0.38461872935295105, "timestamp": "2025-09-05 09:05:50.105713", "step": 2152, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:05:50.302548", "step": 2152, "epoch": 2 }, { "type": "loss", "content": 0.4068615138530731, "timestamp": "2025-09-05 09:05:50.304488", "step": 2153, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:05:50.472419", "step": 2153, "epoch": 2 }, { "type": "loss", "content": 0.3176921606063843, "timestamp": "2025-09-05 09:05:50.474129", "step": 2154, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:05:50.681724", "step": 2154, "epoch": 2 }, { "type": "loss", "content": 0.34700101613998413, "timestamp": "2025-09-05 09:05:50.683454", "step": 2155, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:05:50.851655", "step": 2155, "epoch": 2 }, { "type": "loss", "content": 0.3761594891548157, "timestamp": "2025-09-05 09:05:50.869173", "step": 2156, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:05:51.068367", "step": 2156, "epoch": 2 }, { "type": "loss", "content": 0.1952638030052185, "timestamp": "2025-09-05 09:05:51.070604", "step": 2157, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:05:51.320283", "step": 2157, "epoch": 2 }, { "type": "loss", "content": 0.4499374330043793, "timestamp": "2025-09-05 09:05:51.322342", "step": 2158, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:05:51.521209", "step": 2158, "epoch": 2 }, { "type": "loss", "content": 0.2834685146808624, "timestamp": "2025-09-05 09:05:51.523072", "step": 2159, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:05:51.731601", "step": 2159, "epoch": 2 }, { "type": "loss", "content": 0.27654868364334106, "timestamp": "2025-09-05 09:05:51.745727", "step": 2160, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 09:05:57.239086", "step": 2160, "epoch": 2 }, { "type": "pplx", "content": 55.48316011694305, "timestamp": "2025-09-05 09:05:57.242217", "step": 2160, "epoch": 2 }, { "type": "info", "content": "Checkpoint saved at step 2160", "timestamp": "2025-09-05 09:05:57.693603", "step": 2160, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:05:57.858500", "step": 2160, "epoch": 2 }, { "type": "loss", "content": 0.35953032970428467, "timestamp": "2025-09-05 09:05:57.860617", "step": 2161, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:05:58.029205", "step": 2161, "epoch": 2 }, { "type": "loss", "content": 0.4004409611225128, "timestamp": "2025-09-05 09:05:58.030979", "step": 2162, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:05:58.198364", "step": 2162, "epoch": 2 }, { "type": "loss", "content": 0.45261430740356445, "timestamp": "2025-09-05 09:05:58.200549", "step": 2163, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:05:58.367098", "step": 2163, "epoch": 2 }, { "type": "loss", "content": 0.1983097940683365, "timestamp": "2025-09-05 09:05:58.382083", "step": 2164, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:05:58.571827", "step": 2164, "epoch": 2 }, { "type": "loss", "content": 0.2994425594806671, "timestamp": "2025-09-05 09:05:58.573488", "step": 2165, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:05:58.743371", "step": 2165, "epoch": 2 }, { "type": "loss", "content": 0.3872717618942261, "timestamp": "2025-09-05 09:05:58.745170", "step": 2166, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:05:58.953533", "step": 2166, "epoch": 2 }, { "type": "loss", "content": 0.2745550572872162, "timestamp": "2025-09-05 09:05:58.955400", "step": 2167, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:05:59.164803", "step": 2167, "epoch": 2 }, { "type": "loss", "content": 0.3339973986148834, "timestamp": "2025-09-05 09:05:59.180082", "step": 2168, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:05:59.375968", "step": 2168, "epoch": 2 }, { "type": "loss", "content": 0.4272925555706024, "timestamp": "2025-09-05 09:05:59.378179", "step": 2169, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:05:59.586272", "step": 2169, "epoch": 2 }, { "type": "loss", "content": 0.2760466933250427, "timestamp": "2025-09-05 09:05:59.588264", "step": 2170, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:05:59.795018", "step": 2170, "epoch": 2 }, { "type": "loss", "content": 0.25868964195251465, "timestamp": "2025-09-05 09:05:59.796794", "step": 2171, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:05:59.992597", "step": 2171, "epoch": 2 }, { "type": "loss", "content": 0.2131020873785019, "timestamp": "2025-09-05 09:06:00.009053", "step": 2172, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:06:00.205797", "step": 2172, "epoch": 2 }, { "type": "loss", "content": 0.3480170667171478, "timestamp": "2025-09-05 09:06:00.207987", "step": 2173, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:06:00.375416", "step": 2173, "epoch": 2 }, { "type": "loss", "content": 0.38949131965637207, "timestamp": "2025-09-05 09:06:00.377072", "step": 2174, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-05 09:06:00.582310", "step": 2174, "epoch": 2 }, { "type": "loss", "content": 0.3337663412094116, "timestamp": "2025-09-05 09:06:00.584027", "step": 2175, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:06:00.780812", "step": 2175, "epoch": 2 }, { "type": "loss", "content": 0.48506397008895874, "timestamp": "2025-09-05 09:06:00.796080", "step": 2176, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:06:00.988006", "step": 2176, "epoch": 2 }, { "type": "loss", "content": 0.34281837940216064, "timestamp": "2025-09-05 09:06:00.989875", "step": 2177, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:06:01.188090", "step": 2177, "epoch": 2 }, { "type": "loss", "content": 0.2596714496612549, "timestamp": "2025-09-05 09:06:01.189855", "step": 2178, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:06:01.360399", "step": 2178, "epoch": 2 }, { "type": "loss", "content": 0.20409443974494934, "timestamp": "2025-09-05 09:06:01.362078", "step": 2179, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:06:01.567175", "step": 2179, "epoch": 2 }, { "type": "loss", "content": 0.34518057107925415, "timestamp": "2025-09-05 09:06:01.581103", "step": 2180, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 09:06:06.460758", "step": 2180, "epoch": 2 }, { "type": "pplx", "content": 55.87232859467735, "timestamp": "2025-09-05 09:06:06.462417", "step": 2180, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 4480027263872.0 }, "timestamp": "2025-09-05 09:06:06.626251", "step": 2180, "epoch": 2 }, { "type": "loss", "content": 0.3369835317134857, "timestamp": "2025-09-05 09:06:06.628361", "step": 2181, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 4480027263872.0 }, "timestamp": "2025-09-05 09:06:06.797342", "step": 2181, "epoch": 2 }, { "type": "loss", "content": 0.3712189793586731, "timestamp": "2025-09-05 09:06:06.798930", "step": 2182, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:06:06.969597", "step": 2182, "epoch": 2 }, { "type": "loss", "content": 0.3187856376171112, "timestamp": "2025-09-05 09:06:06.972251", "step": 2183, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:06:07.140449", "step": 2183, "epoch": 2 }, { "type": "loss", "content": 0.20823392271995544, "timestamp": "2025-09-05 09:06:07.158686", "step": 2184, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 4480027263872.0 }, "timestamp": "2025-09-05 09:06:07.365847", "step": 2184, "epoch": 2 }, { "type": "loss", "content": 0.3842635154724121, "timestamp": "2025-09-05 09:06:07.372022", "step": 2185, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:06:07.583304", "step": 2185, "epoch": 2 }, { "type": "loss", "content": 0.24682661890983582, "timestamp": "2025-09-05 09:06:07.586882", "step": 2186, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:06:07.785086", "step": 2186, "epoch": 2 }, { "type": "loss", "content": 0.32766950130462646, "timestamp": "2025-09-05 09:06:07.788027", "step": 2187, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:06:07.989795", "step": 2187, "epoch": 2 }, { "type": "loss", "content": 0.49860307574272156, "timestamp": "2025-09-05 09:06:08.007831", "step": 2188, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:06:08.211143", "step": 2188, "epoch": 2 }, { "type": "loss", "content": 0.4436277449131012, "timestamp": "2025-09-05 09:06:08.213455", "step": 2189, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:06:08.420670", "step": 2189, "epoch": 2 }, { "type": "loss", "content": 0.24359887838363647, "timestamp": "2025-09-05 09:06:08.423027", "step": 2190, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:06:08.629797", "step": 2190, "epoch": 2 }, { "type": "loss", "content": 0.2521995007991791, "timestamp": "2025-09-05 09:06:08.632822", "step": 2191, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:06:08.799043", "step": 2191, "epoch": 2 }, { "type": "loss", "content": 0.4108291566371918, "timestamp": "2025-09-05 09:06:08.814395", "step": 2192, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:06:09.001706", "step": 2192, "epoch": 2 }, { "type": "loss", "content": 0.4450414478778839, "timestamp": "2025-09-05 09:06:09.004367", "step": 2193, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:06:09.174249", "step": 2193, "epoch": 2 }, { "type": "loss", "content": 0.3344271779060364, "timestamp": "2025-09-05 09:06:09.176943", "step": 2194, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:06:09.385077", "step": 2194, "epoch": 2 }, { "type": "loss", "content": 0.3703814744949341, "timestamp": "2025-09-05 09:06:09.388491", "step": 2195, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:06:09.556016", "step": 2195, "epoch": 2 }, { "type": "loss", "content": 0.44833752512931824, "timestamp": "2025-09-05 09:06:09.566350", "step": 2196, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:06:09.730307", "step": 2196, "epoch": 2 }, { "type": "loss", "content": 0.3722361922264099, "timestamp": "2025-09-05 09:06:09.732642", "step": 2197, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:06:09.938807", "step": 2197, "epoch": 2 }, { "type": "loss", "content": 0.43370842933654785, "timestamp": "2025-09-05 09:06:09.941992", "step": 2198, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:06:10.138574", "step": 2198, "epoch": 2 }, { "type": "loss", "content": 0.27109774947166443, "timestamp": "2025-09-05 09:06:10.142016", "step": 2199, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:06:10.347324", "step": 2199, "epoch": 2 }, { "type": "loss", "content": 0.23769542574882507, "timestamp": "2025-09-05 09:06:10.365235", "step": 2200, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 09:06:15.248799", "step": 2200, "epoch": 2 }, { "type": "pplx", "content": 55.81018461535365, "timestamp": "2025-09-05 09:06:15.251062", "step": 2200, "epoch": 2 }, { "type": "info", "content": "Checkpoint saved at step 2200", "timestamp": "2025-09-05 09:06:15.710155", "step": 2200, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 4480027263872.0 }, "timestamp": "2025-09-05 09:06:15.875503", "step": 2200, "epoch": 2 }, { "type": "loss", "content": 0.3465867042541504, "timestamp": "2025-09-05 09:06:15.877472", "step": 2201, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:06:16.045988", "step": 2201, "epoch": 2 }, { "type": "loss", "content": 0.28614541888237, "timestamp": "2025-09-05 09:06:16.047770", "step": 2202, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:06:16.215359", "step": 2202, "epoch": 2 }, { "type": "loss", "content": 0.2435327023267746, "timestamp": "2025-09-05 09:06:16.217829", "step": 2203, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:06:16.412558", "step": 2203, "epoch": 2 }, { "type": "loss", "content": 0.28242167830467224, "timestamp": "2025-09-05 09:06:16.429917", "step": 2204, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:06:16.627712", "step": 2204, "epoch": 2 }, { "type": "loss", "content": 0.40695592761039734, "timestamp": "2025-09-05 09:06:16.629514", "step": 2205, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:06:16.796399", "step": 2205, "epoch": 2 }, { "type": "loss", "content": 0.2968638241291046, "timestamp": "2025-09-05 09:06:16.798607", "step": 2206, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:06:16.995784", "step": 2206, "epoch": 2 }, { "type": "loss", "content": 0.29859426617622375, "timestamp": "2025-09-05 09:06:16.997868", "step": 2207, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:06:17.204998", "step": 2207, "epoch": 2 }, { "type": "loss", "content": 0.2591385841369629, "timestamp": "2025-09-05 09:06:17.219261", "step": 2208, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:06:17.409143", "step": 2208, "epoch": 2 }, { "type": "loss", "content": 0.3400825262069702, "timestamp": "2025-09-05 09:06:17.440817", "step": 2209, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:06:17.693884", "step": 2209, "epoch": 2 }, { "type": "loss", "content": 0.3577132821083069, "timestamp": "2025-09-05 09:06:17.695615", "step": 2210, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:06:17.895401", "step": 2210, "epoch": 2 }, { "type": "loss", "content": 0.346355140209198, "timestamp": "2025-09-05 09:06:17.897200", "step": 2211, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 4480027263872.0 }, "timestamp": "2025-09-05 09:06:18.065043", "step": 2211, "epoch": 2 }, { "type": "loss", "content": 0.302179217338562, "timestamp": "2025-09-05 09:06:18.081108", "step": 2212, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:06:18.271084", "step": 2212, "epoch": 2 }, { "type": "loss", "content": 0.27992647886276245, "timestamp": "2025-09-05 09:06:18.273478", "step": 2213, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:06:18.440962", "step": 2213, "epoch": 2 }, { "type": "loss", "content": 0.2065676599740982, "timestamp": "2025-09-05 09:06:18.443396", "step": 2214, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:06:18.655269", "step": 2214, "epoch": 2 }, { "type": "loss", "content": 0.28311607241630554, "timestamp": "2025-09-05 09:06:18.657127", "step": 2215, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:06:18.824537", "step": 2215, "epoch": 2 }, { "type": "loss", "content": 0.3312028646469116, "timestamp": "2025-09-05 09:06:18.838942", "step": 2216, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:06:19.028366", "step": 2216, "epoch": 2 }, { "type": "loss", "content": 0.2649993896484375, "timestamp": "2025-09-05 09:06:19.030572", "step": 2217, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:06:19.239728", "step": 2217, "epoch": 2 }, { "type": "loss", "content": 0.2982449233531952, "timestamp": "2025-09-05 09:06:19.241520", "step": 2218, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:06:19.450240", "step": 2218, "epoch": 2 }, { "type": "loss", "content": 0.27559399604797363, "timestamp": "2025-09-05 09:06:19.454792", "step": 2219, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:06:19.653922", "step": 2219, "epoch": 2 }, { "type": "loss", "content": 0.28188076615333557, "timestamp": "2025-09-05 09:06:19.671388", "step": 2220, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 09:06:24.361464", "step": 2220, "epoch": 2 }, { "type": "pplx", "content": 55.25948775718098, "timestamp": "2025-09-05 09:06:24.364366", "step": 2220, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:06:24.528802", "step": 2220, "epoch": 2 }, { "type": "loss", "content": 0.28795936703681946, "timestamp": "2025-09-05 09:06:24.530496", "step": 2221, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:06:24.697761", "step": 2221, "epoch": 2 }, { "type": "loss", "content": 0.40923991799354553, "timestamp": "2025-09-05 09:06:24.699852", "step": 2222, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:06:24.867605", "step": 2222, "epoch": 2 }, { "type": "loss", "content": 0.31736886501312256, "timestamp": "2025-09-05 09:06:24.869794", "step": 2223, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:06:25.036879", "step": 2223, "epoch": 2 }, { "type": "loss", "content": 0.2096620500087738, "timestamp": "2025-09-05 09:06:25.051680", "step": 2224, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:06:25.239543", "step": 2224, "epoch": 2 }, { "type": "loss", "content": 0.484735906124115, "timestamp": "2025-09-05 09:06:25.241461", "step": 2225, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:06:25.438820", "step": 2225, "epoch": 2 }, { "type": "loss", "content": 0.34063124656677246, "timestamp": "2025-09-05 09:06:25.441718", "step": 2226, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:06:25.637874", "step": 2226, "epoch": 2 }, { "type": "loss", "content": 0.29765599966049194, "timestamp": "2025-09-05 09:06:25.639588", "step": 2227, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:06:25.805672", "step": 2227, "epoch": 2 }, { "type": "loss", "content": 0.24793501198291779, "timestamp": "2025-09-05 09:06:25.814750", "step": 2228, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:06:25.976719", "step": 2228, "epoch": 2 }, { "type": "loss", "content": 0.33141300082206726, "timestamp": "2025-09-05 09:06:25.979149", "step": 2229, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:06:26.186014", "step": 2229, "epoch": 2 }, { "type": "loss", "content": 0.3666149079799652, "timestamp": "2025-09-05 09:06:26.188475", "step": 2230, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:06:26.475436", "step": 2230, "epoch": 2 }, { "type": "loss", "content": 0.23982176184654236, "timestamp": "2025-09-05 09:06:26.477600", "step": 2231, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:06:26.673606", "step": 2231, "epoch": 2 }, { "type": "loss", "content": 0.23586024343967438, "timestamp": "2025-09-05 09:06:26.684063", "step": 2232, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:06:26.847693", "step": 2232, "epoch": 2 }, { "type": "loss", "content": 0.22031444311141968, "timestamp": "2025-09-05 09:06:26.849808", "step": 2233, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:06:27.016397", "step": 2233, "epoch": 2 }, { "type": "loss", "content": 0.2034074366092682, "timestamp": "2025-09-05 09:06:27.018349", "step": 2234, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:06:27.224905", "step": 2234, "epoch": 2 }, { "type": "loss", "content": 0.3312498927116394, "timestamp": "2025-09-05 09:06:27.226712", "step": 2235, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:06:27.393791", "step": 2235, "epoch": 2 }, { "type": "loss", "content": 0.16999323666095734, "timestamp": "2025-09-05 09:06:27.410448", "step": 2236, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:06:27.606051", "step": 2236, "epoch": 2 }, { "type": "loss", "content": 0.262175053358078, "timestamp": "2025-09-05 09:06:27.607922", "step": 2237, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:06:27.804022", "step": 2237, "epoch": 2 }, { "type": "loss", "content": 0.3137682378292084, "timestamp": "2025-09-05 09:06:27.805913", "step": 2238, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:06:28.012450", "step": 2238, "epoch": 2 }, { "type": "loss", "content": 0.40954217314720154, "timestamp": "2025-09-05 09:06:28.014922", "step": 2239, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:06:28.182501", "step": 2239, "epoch": 2 }, { "type": "loss", "content": 0.38842806220054626, "timestamp": "2025-09-05 09:06:28.199729", "step": 2240, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 09:06:33.159795", "step": 2240, "epoch": 2 }, { "type": "pplx", "content": 54.81411080714311, "timestamp": "2025-09-05 09:06:33.161733", "step": 2240, "epoch": 2 }, { "type": "info", "content": "Checkpoint saved at step 2240", "timestamp": "2025-09-05 09:06:33.625419", "step": 2240, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:06:33.789231", "step": 2240, "epoch": 2 }, { "type": "loss", "content": 0.30577048659324646, "timestamp": "2025-09-05 09:06:33.791819", "step": 2241, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:06:33.959669", "step": 2241, "epoch": 2 }, { "type": "loss", "content": 0.2516123950481415, "timestamp": "2025-09-05 09:06:33.961351", "step": 2242, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:06:34.128513", "step": 2242, "epoch": 2 }, { "type": "loss", "content": 0.29832541942596436, "timestamp": "2025-09-05 09:06:34.130926", "step": 2243, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:06:34.326975", "step": 2243, "epoch": 2 }, { "type": "loss", "content": 0.352728009223938, "timestamp": "2025-09-05 09:06:34.336301", "step": 2244, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:06:34.500913", "step": 2244, "epoch": 2 }, { "type": "loss", "content": 0.29814958572387695, "timestamp": "2025-09-05 09:06:34.503201", "step": 2245, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:06:34.671218", "step": 2245, "epoch": 2 }, { "type": "loss", "content": 0.3409227728843689, "timestamp": "2025-09-05 09:06:34.673286", "step": 2246, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-05 09:06:34.867144", "step": 2246, "epoch": 2 }, { "type": "loss", "content": 0.37319034337997437, "timestamp": "2025-09-05 09:06:34.869145", "step": 2247, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:06:35.035883", "step": 2247, "epoch": 2 }, { "type": "loss", "content": 0.378897100687027, "timestamp": "2025-09-05 09:06:35.050269", "step": 2248, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 4480027263872.0 }, "timestamp": "2025-09-05 09:06:35.241028", "step": 2248, "epoch": 2 }, { "type": "loss", "content": 0.47297292947769165, "timestamp": "2025-09-05 09:06:35.243250", "step": 2249, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:06:35.448153", "step": 2249, "epoch": 2 }, { "type": "loss", "content": 0.2587435245513916, "timestamp": "2025-09-05 09:06:35.450194", "step": 2250, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:06:35.620591", "step": 2250, "epoch": 2 }, { "type": "loss", "content": 0.21650661528110504, "timestamp": "2025-09-05 09:06:35.622564", "step": 2251, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:06:35.819398", "step": 2251, "epoch": 2 }, { "type": "loss", "content": 0.3861205279827118, "timestamp": "2025-09-05 09:06:35.834116", "step": 2252, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:06:36.023040", "step": 2252, "epoch": 2 }, { "type": "loss", "content": 0.24851293861865997, "timestamp": "2025-09-05 09:06:36.025229", "step": 2253, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:06:36.193242", "step": 2253, "epoch": 2 }, { "type": "loss", "content": 0.38230419158935547, "timestamp": "2025-09-05 09:06:36.195357", "step": 2254, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:06:36.401772", "step": 2254, "epoch": 2 }, { "type": "loss", "content": 0.23924830555915833, "timestamp": "2025-09-05 09:06:36.403897", "step": 2255, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:06:36.601332", "step": 2255, "epoch": 2 }, { "type": "loss", "content": 0.23087681829929352, "timestamp": "2025-09-05 09:06:36.616592", "step": 2256, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:06:36.803359", "step": 2256, "epoch": 2 }, { "type": "loss", "content": 0.31772562861442566, "timestamp": "2025-09-05 09:06:36.805156", "step": 2257, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:06:37.000355", "step": 2257, "epoch": 2 }, { "type": "loss", "content": 0.37587422132492065, "timestamp": "2025-09-05 09:06:37.002348", "step": 2258, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:06:37.169293", "step": 2258, "epoch": 2 }, { "type": "loss", "content": 0.5363959074020386, "timestamp": "2025-09-05 09:06:37.171488", "step": 2259, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 4480027263872.0 }, "timestamp": "2025-09-05 09:06:37.376633", "step": 2259, "epoch": 2 }, { "type": "loss", "content": 0.3956094980239868, "timestamp": "2025-09-05 09:06:37.393078", "step": 2260, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 09:06:42.110986", "step": 2260, "epoch": 2 }, { "type": "pplx", "content": 54.873000385281806, "timestamp": "2025-09-05 09:06:42.113444", "step": 2260, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:06:42.274354", "step": 2260, "epoch": 2 }, { "type": "loss", "content": 0.3519768714904785, "timestamp": "2025-09-05 09:06:42.276425", "step": 2261, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:06:42.526981", "step": 2261, "epoch": 2 }, { "type": "loss", "content": 0.2870665192604065, "timestamp": "2025-09-05 09:06:42.528820", "step": 2262, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:06:42.732781", "step": 2262, "epoch": 2 }, { "type": "loss", "content": 0.3545304238796234, "timestamp": "2025-09-05 09:06:42.734675", "step": 2263, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:06:42.932972", "step": 2263, "epoch": 2 }, { "type": "loss", "content": 0.2984987497329712, "timestamp": "2025-09-05 09:06:42.943221", "step": 2264, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:06:43.105999", "step": 2264, "epoch": 2 }, { "type": "loss", "content": 0.3972173035144806, "timestamp": "2025-09-05 09:06:43.108100", "step": 2265, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:06:43.275800", "step": 2265, "epoch": 2 }, { "type": "loss", "content": 0.3706391751766205, "timestamp": "2025-09-05 09:06:43.277701", "step": 2266, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:06:43.471167", "step": 2266, "epoch": 2 }, { "type": "loss", "content": 0.36558616161346436, "timestamp": "2025-09-05 09:06:43.473657", "step": 2267, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:06:43.641189", "step": 2267, "epoch": 2 }, { "type": "loss", "content": 0.2613338232040405, "timestamp": "2025-09-05 09:06:43.658410", "step": 2268, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 4480027263872.0 }, "timestamp": "2025-09-05 09:06:43.856335", "step": 2268, "epoch": 2 }, { "type": "loss", "content": 0.3643812835216522, "timestamp": "2025-09-05 09:06:43.858525", "step": 2269, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:06:44.057101", "step": 2269, "epoch": 2 }, { "type": "loss", "content": 0.3404228091239929, "timestamp": "2025-09-05 09:06:44.059038", "step": 2270, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:06:44.228781", "step": 2270, "epoch": 2 }, { "type": "loss", "content": 0.4255596101284027, "timestamp": "2025-09-05 09:06:44.230693", "step": 2271, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:06:44.399436", "step": 2271, "epoch": 2 }, { "type": "loss", "content": 0.3181018829345703, "timestamp": "2025-09-05 09:06:44.408822", "step": 2272, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:06:44.571743", "step": 2272, "epoch": 2 }, { "type": "loss", "content": 0.2214895486831665, "timestamp": "2025-09-05 09:06:44.573526", "step": 2273, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:06:44.740584", "step": 2273, "epoch": 2 }, { "type": "loss", "content": 0.34708890318870544, "timestamp": "2025-09-05 09:06:44.742388", "step": 2274, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:06:44.938691", "step": 2274, "epoch": 2 }, { "type": "loss", "content": 0.39245232939720154, "timestamp": "2025-09-05 09:06:44.940467", "step": 2275, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:06:45.109042", "step": 2275, "epoch": 2 }, { "type": "loss", "content": 0.30886292457580566, "timestamp": "2025-09-05 09:06:45.126949", "step": 2276, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:06:45.324958", "step": 2276, "epoch": 2 }, { "type": "loss", "content": 0.3094596564769745, "timestamp": "2025-09-05 09:06:45.327565", "step": 2277, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:06:45.532781", "step": 2277, "epoch": 2 }, { "type": "loss", "content": 0.3498867452144623, "timestamp": "2025-09-05 09:06:45.534636", "step": 2278, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:06:45.741366", "step": 2278, "epoch": 2 }, { "type": "loss", "content": 0.4171161651611328, "timestamp": "2025-09-05 09:06:45.743093", "step": 2279, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:06:45.912226", "step": 2279, "epoch": 2 }, { "type": "loss", "content": 0.3185728192329407, "timestamp": "2025-09-05 09:06:45.927001", "step": 2280, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 09:06:50.639787", "step": 2280, "epoch": 2 }, { "type": "pplx", "content": 54.187925265235314, "timestamp": "2025-09-05 09:06:50.641591", "step": 2280, "epoch": 2 }, { "type": "info", "content": "Checkpoint saved at step 2280", "timestamp": "2025-09-05 09:06:51.097949", "step": 2280, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:06:51.263440", "step": 2280, "epoch": 2 }, { "type": "loss", "content": 0.31183862686157227, "timestamp": "2025-09-05 09:06:51.265652", "step": 2281, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:06:51.433139", "step": 2281, "epoch": 2 }, { "type": "loss", "content": 0.39186206459999084, "timestamp": "2025-09-05 09:06:51.435416", "step": 2282, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:06:51.642913", "step": 2282, "epoch": 2 }, { "type": "loss", "content": 0.30460259318351746, "timestamp": "2025-09-05 09:06:51.644628", "step": 2283, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:06:51.811414", "step": 2283, "epoch": 2 }, { "type": "loss", "content": 0.3289051055908203, "timestamp": "2025-09-05 09:06:51.827980", "step": 2284, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:06:52.021861", "step": 2284, "epoch": 2 }, { "type": "loss", "content": 0.3493216335773468, "timestamp": "2025-09-05 09:06:52.023824", "step": 2285, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:06:52.188519", "step": 2285, "epoch": 2 }, { "type": "loss", "content": 0.1778910756111145, "timestamp": "2025-09-05 09:06:52.190599", "step": 2286, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:06:52.388190", "step": 2286, "epoch": 2 }, { "type": "loss", "content": 0.22400565445423126, "timestamp": "2025-09-05 09:06:52.390250", "step": 2287, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:06:52.595404", "step": 2287, "epoch": 2 }, { "type": "loss", "content": 0.2981906533241272, "timestamp": "2025-09-05 09:06:52.604955", "step": 2288, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:06:52.767428", "step": 2288, "epoch": 2 }, { "type": "loss", "content": 0.26950085163116455, "timestamp": "2025-09-05 09:06:52.769577", "step": 2289, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:06:52.935744", "step": 2289, "epoch": 2 }, { "type": "loss", "content": 0.29919737577438354, "timestamp": "2025-09-05 09:06:52.937652", "step": 2290, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:06:53.134942", "step": 2290, "epoch": 2 }, { "type": "loss", "content": 0.35156911611557007, "timestamp": "2025-09-05 09:06:53.136642", "step": 2291, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:06:53.303716", "step": 2291, "epoch": 2 }, { "type": "loss", "content": 0.376793771982193, "timestamp": "2025-09-05 09:06:53.319226", "step": 2292, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:06:53.514148", "step": 2292, "epoch": 2 }, { "type": "loss", "content": 0.38039150834083557, "timestamp": "2025-09-05 09:06:53.515929", "step": 2293, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:06:53.682172", "step": 2293, "epoch": 2 }, { "type": "loss", "content": 0.29825207591056824, "timestamp": "2025-09-05 09:06:53.684304", "step": 2294, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:06:53.880490", "step": 2294, "epoch": 2 }, { "type": "loss", "content": 0.31032633781433105, "timestamp": "2025-09-05 09:06:53.882195", "step": 2295, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:06:54.047414", "step": 2295, "epoch": 2 }, { "type": "loss", "content": 0.2269105762243271, "timestamp": "2025-09-05 09:06:54.064076", "step": 2296, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:06:54.308280", "step": 2296, "epoch": 2 }, { "type": "loss", "content": 0.2711460590362549, "timestamp": "2025-09-05 09:06:54.310952", "step": 2297, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 4480027263872.0 }, "timestamp": "2025-09-05 09:06:54.522080", "step": 2297, "epoch": 2 }, { "type": "loss", "content": 0.32219910621643066, "timestamp": "2025-09-05 09:06:54.523953", "step": 2298, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:06:54.730976", "step": 2298, "epoch": 2 }, { "type": "loss", "content": 0.3553353548049927, "timestamp": "2025-09-05 09:06:54.732882", "step": 2299, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:06:54.899746", "step": 2299, "epoch": 2 }, { "type": "loss", "content": 0.3341793119907379, "timestamp": "2025-09-05 09:06:54.915155", "step": 2300, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 09:06:59.573196", "step": 2300, "epoch": 2 }, { "type": "pplx", "content": 53.83812948418028, "timestamp": "2025-09-05 09:06:59.576460", "step": 2300, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:06:59.738712", "step": 2300, "epoch": 2 }, { "type": "loss", "content": 0.37382030487060547, "timestamp": "2025-09-05 09:06:59.740310", "step": 2301, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:06:59.907105", "step": 2301, "epoch": 2 }, { "type": "loss", "content": 0.3155753016471863, "timestamp": "2025-09-05 09:06:59.908756", "step": 2302, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:07:00.113462", "step": 2302, "epoch": 2 }, { "type": "loss", "content": 0.3371022045612335, "timestamp": "2025-09-05 09:07:00.115231", "step": 2303, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:07:00.310344", "step": 2303, "epoch": 2 }, { "type": "loss", "content": 0.3813401162624359, "timestamp": "2025-09-05 09:07:00.324859", "step": 2304, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:07:00.512247", "step": 2304, "epoch": 2 }, { "type": "loss", "content": 0.3163757920265198, "timestamp": "2025-09-05 09:07:00.513868", "step": 2305, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:07:00.709592", "step": 2305, "epoch": 2 }, { "type": "loss", "content": 0.4015030264854431, "timestamp": "2025-09-05 09:07:00.713421", "step": 2306, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:07:00.911204", "step": 2306, "epoch": 2 }, { "type": "loss", "content": 0.241668239235878, "timestamp": "2025-09-05 09:07:00.912870", "step": 2307, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:07:01.112322", "step": 2307, "epoch": 2 }, { "type": "loss", "content": 0.3123008906841278, "timestamp": "2025-09-05 09:07:01.122844", "step": 2308, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:07:01.284697", "step": 2308, "epoch": 2 }, { "type": "loss", "content": 0.22155259549617767, "timestamp": "2025-09-05 09:07:01.286352", "step": 2309, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:07:01.490379", "step": 2309, "epoch": 2 }, { "type": "loss", "content": 0.2877575755119324, "timestamp": "2025-09-05 09:07:01.492069", "step": 2310, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:07:01.696638", "step": 2310, "epoch": 2 }, { "type": "loss", "content": 0.28378915786743164, "timestamp": "2025-09-05 09:07:01.698262", "step": 2311, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:07:01.904919", "step": 2311, "epoch": 2 }, { "type": "loss", "content": 0.305711030960083, "timestamp": "2025-09-05 09:07:01.914270", "step": 2312, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:07:02.076917", "step": 2312, "epoch": 2 }, { "type": "loss", "content": 0.3109692633152008, "timestamp": "2025-09-05 09:07:02.079113", "step": 2313, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:07:02.283526", "step": 2313, "epoch": 2 }, { "type": "loss", "content": 0.24978035688400269, "timestamp": "2025-09-05 09:07:02.285251", "step": 2314, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:07:02.482566", "step": 2314, "epoch": 2 }, { "type": "loss", "content": 0.35233667492866516, "timestamp": "2025-09-05 09:07:02.484199", "step": 2315, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:07:02.679086", "step": 2315, "epoch": 2 }, { "type": "loss", "content": 0.24674127995967865, "timestamp": "2025-09-05 09:07:02.696266", "step": 2316, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:07:02.892813", "step": 2316, "epoch": 2 }, { "type": "loss", "content": 0.3011695444583893, "timestamp": "2025-09-05 09:07:02.894598", "step": 2317, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:07:03.063154", "step": 2317, "epoch": 2 }, { "type": "loss", "content": 0.30131420493125916, "timestamp": "2025-09-05 09:07:03.065608", "step": 2318, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:07:03.271933", "step": 2318, "epoch": 2 }, { "type": "loss", "content": 0.3193568289279938, "timestamp": "2025-09-05 09:07:03.273728", "step": 2319, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:07:03.479048", "step": 2319, "epoch": 2 }, { "type": "loss", "content": 0.2760051488876343, "timestamp": "2025-09-05 09:07:03.493537", "step": 2320, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 09:07:08.176711", "step": 2320, "epoch": 2 }, { "type": "pplx", "content": 53.84229306421487, "timestamp": "2025-09-05 09:07:08.178871", "step": 2320, "epoch": 2 }, { "type": "info", "content": "Checkpoint saved at step 2320", "timestamp": "2025-09-05 09:07:08.636200", "step": 2320, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:07:08.804432", "step": 2320, "epoch": 2 }, { "type": "loss", "content": 0.3730832636356354, "timestamp": "2025-09-05 09:07:08.806721", "step": 2321, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:07:09.011245", "step": 2321, "epoch": 2 }, { "type": "loss", "content": 0.47483229637145996, "timestamp": "2025-09-05 09:07:09.013084", "step": 2322, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:07:09.218081", "step": 2322, "epoch": 2 }, { "type": "loss", "content": 0.26618441939353943, "timestamp": "2025-09-05 09:07:09.219997", "step": 2323, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:07:09.416051", "step": 2323, "epoch": 2 }, { "type": "loss", "content": 0.262174516916275, "timestamp": "2025-09-05 09:07:09.425547", "step": 2324, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:07:09.587653", "step": 2324, "epoch": 2 }, { "type": "loss", "content": 0.43635013699531555, "timestamp": "2025-09-05 09:07:09.589717", "step": 2325, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:07:09.796156", "step": 2325, "epoch": 2 }, { "type": "loss", "content": 0.2052556276321411, "timestamp": "2025-09-05 09:07:09.797976", "step": 2326, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:07:09.967138", "step": 2326, "epoch": 2 }, { "type": "loss", "content": 0.3712131083011627, "timestamp": "2025-09-05 09:07:09.969415", "step": 2327, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:07:10.164137", "step": 2327, "epoch": 2 }, { "type": "loss", "content": 0.35729169845581055, "timestamp": "2025-09-05 09:07:10.178407", "step": 2328, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:07:10.368304", "step": 2328, "epoch": 2 }, { "type": "loss", "content": 0.39892011880874634, "timestamp": "2025-09-05 09:07:10.370520", "step": 2329, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:07:10.567236", "step": 2329, "epoch": 2 }, { "type": "loss", "content": 0.36499282717704773, "timestamp": "2025-09-05 09:07:10.569442", "step": 2330, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:07:10.780649", "step": 2330, "epoch": 2 }, { "type": "loss", "content": 0.23433853685855865, "timestamp": "2025-09-05 09:07:10.782762", "step": 2331, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:07:10.979945", "step": 2331, "epoch": 2 }, { "type": "loss", "content": 0.24312803149223328, "timestamp": "2025-09-05 09:07:10.994537", "step": 2332, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:07:11.183633", "step": 2332, "epoch": 2 }, { "type": "loss", "content": 0.26388394832611084, "timestamp": "2025-09-05 09:07:11.185595", "step": 2333, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:07:11.393159", "step": 2333, "epoch": 2 }, { "type": "loss", "content": 0.4343104660511017, "timestamp": "2025-09-05 09:07:11.395175", "step": 2334, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 4480027263872.0 }, "timestamp": "2025-09-05 09:07:11.593175", "step": 2334, "epoch": 2 }, { "type": "loss", "content": 0.39986181259155273, "timestamp": "2025-09-05 09:07:11.594881", "step": 2335, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:07:11.793473", "step": 2335, "epoch": 2 }, { "type": "loss", "content": 0.48972994089126587, "timestamp": "2025-09-05 09:07:11.802839", "step": 2336, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:07:11.967517", "step": 2336, "epoch": 2 }, { "type": "loss", "content": 0.2836100459098816, "timestamp": "2025-09-05 09:07:11.969198", "step": 2337, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:07:12.174955", "step": 2337, "epoch": 2 }, { "type": "loss", "content": 0.2238231599330902, "timestamp": "2025-09-05 09:07:12.177197", "step": 2338, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 4480027263872.0 }, "timestamp": "2025-09-05 09:07:12.374452", "step": 2338, "epoch": 2 }, { "type": "loss", "content": 0.3275935649871826, "timestamp": "2025-09-05 09:07:12.377033", "step": 2339, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:07:12.573386", "step": 2339, "epoch": 2 }, { "type": "loss", "content": 0.31032243371009827, "timestamp": "2025-09-05 09:07:12.587393", "step": 2340, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 09:07:17.348659", "step": 2340, "epoch": 2 }, { "type": "pplx", "content": 54.3609793507131, "timestamp": "2025-09-05 09:07:17.350682", "step": 2340, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:07:17.511929", "step": 2340, "epoch": 2 }, { "type": "loss", "content": 0.37032943964004517, "timestamp": "2025-09-05 09:07:17.514323", "step": 2341, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:07:17.721157", "step": 2341, "epoch": 2 }, { "type": "loss", "content": 0.3330005407333374, "timestamp": "2025-09-05 09:07:17.723262", "step": 2342, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:07:17.919750", "step": 2342, "epoch": 2 }, { "type": "loss", "content": 0.26726004481315613, "timestamp": "2025-09-05 09:07:17.922084", "step": 2343, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:07:18.129709", "step": 2343, "epoch": 2 }, { "type": "loss", "content": 0.2254151552915573, "timestamp": "2025-09-05 09:07:18.139944", "step": 2344, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:07:18.302073", "step": 2344, "epoch": 2 }, { "type": "loss", "content": 0.2709170877933502, "timestamp": "2025-09-05 09:07:18.306175", "step": 2345, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:07:18.520095", "step": 2345, "epoch": 2 }, { "type": "loss", "content": 0.36739596724510193, "timestamp": "2025-09-05 09:07:18.526131", "step": 2346, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:07:18.693682", "step": 2346, "epoch": 2 }, { "type": "loss", "content": 0.24793685972690582, "timestamp": "2025-09-05 09:07:18.699367", "step": 2347, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:07:18.908254", "step": 2347, "epoch": 2 }, { "type": "loss", "content": 0.25923240184783936, "timestamp": "2025-09-05 09:07:18.917770", "step": 2348, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:07:19.083149", "step": 2348, "epoch": 2 }, { "type": "loss", "content": 0.4075714349746704, "timestamp": "2025-09-05 09:07:19.086080", "step": 2349, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:07:19.253820", "step": 2349, "epoch": 2 }, { "type": "loss", "content": 0.22864657640457153, "timestamp": "2025-09-05 09:07:19.269288", "step": 2350, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:07:19.517188", "step": 2350, "epoch": 2 }, { "type": "loss", "content": 0.2772950828075409, "timestamp": "2025-09-05 09:07:19.520982", "step": 2351, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:07:19.693977", "step": 2351, "epoch": 2 }, { "type": "loss", "content": 0.237502783536911, "timestamp": "2025-09-05 09:07:19.710552", "step": 2352, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:07:19.910056", "step": 2352, "epoch": 2 }, { "type": "loss", "content": 0.3322488069534302, "timestamp": "2025-09-05 09:07:19.917125", "step": 2353, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:07:20.116870", "step": 2353, "epoch": 2 }, { "type": "loss", "content": 0.39339199662208557, "timestamp": "2025-09-05 09:07:20.119561", "step": 2354, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:07:20.327031", "step": 2354, "epoch": 2 }, { "type": "loss", "content": 0.31205451488494873, "timestamp": "2025-09-05 09:07:20.331019", "step": 2355, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:07:20.540792", "step": 2355, "epoch": 2 }, { "type": "loss", "content": 0.19401071965694427, "timestamp": "2025-09-05 09:07:20.550029", "step": 2356, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:07:20.714882", "step": 2356, "epoch": 2 }, { "type": "loss", "content": 0.27951422333717346, "timestamp": "2025-09-05 09:07:20.716627", "step": 2357, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:07:20.924647", "step": 2357, "epoch": 2 }, { "type": "loss", "content": 0.3359452486038208, "timestamp": "2025-09-05 09:07:20.926530", "step": 2358, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:07:21.094918", "step": 2358, "epoch": 2 }, { "type": "loss", "content": 0.32733088731765747, "timestamp": "2025-09-05 09:07:21.096900", "step": 2359, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:07:21.294512", "step": 2359, "epoch": 2 }, { "type": "loss", "content": 0.27296555042266846, "timestamp": "2025-09-05 09:07:21.304367", "step": 2360, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 09:07:26.011972", "step": 2360, "epoch": 2 }, { "type": "pplx", "content": 55.69164641376557, "timestamp": "2025-09-05 09:07:26.014599", "step": 2360, "epoch": 2 }, { "type": "info", "content": "Checkpoint saved at step 2360", "timestamp": "2025-09-05 09:07:26.471307", "step": 2360, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 4480027263872.0 }, "timestamp": "2025-09-05 09:07:26.639440", "step": 2360, "epoch": 2 }, { "type": "loss", "content": 0.3209402859210968, "timestamp": "2025-09-05 09:07:26.641714", "step": 2361, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:07:26.845836", "step": 2361, "epoch": 2 }, { "type": "loss", "content": 0.2272927612066269, "timestamp": "2025-09-05 09:07:26.848505", "step": 2362, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:07:27.045204", "step": 2362, "epoch": 2 }, { "type": "loss", "content": 0.3490237295627594, "timestamp": "2025-09-05 09:07:27.047647", "step": 2363, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:07:27.253023", "step": 2363, "epoch": 2 }, { "type": "loss", "content": 0.3624802231788635, "timestamp": "2025-09-05 09:07:27.267156", "step": 2364, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:07:27.457605", "step": 2364, "epoch": 2 }, { "type": "loss", "content": 0.36499476432800293, "timestamp": "2025-09-05 09:07:27.459374", "step": 2365, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:07:27.655960", "step": 2365, "epoch": 2 }, { "type": "loss", "content": 0.28135761618614197, "timestamp": "2025-09-05 09:07:27.658541", "step": 2366, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:07:27.858711", "step": 2366, "epoch": 2 }, { "type": "loss", "content": 0.345076322555542, "timestamp": "2025-09-05 09:07:27.860970", "step": 2367, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:07:28.059087", "step": 2367, "epoch": 2 }, { "type": "loss", "content": 0.41919395327568054, "timestamp": "2025-09-05 09:07:28.068579", "step": 2368, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 4480027263872.0 }, "timestamp": "2025-09-05 09:07:28.231677", "step": 2368, "epoch": 2 }, { "type": "loss", "content": 0.2436763346195221, "timestamp": "2025-09-05 09:07:28.233668", "step": 2369, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:07:28.399911", "step": 2369, "epoch": 2 }, { "type": "loss", "content": 0.23662683367729187, "timestamp": "2025-09-05 09:07:28.402152", "step": 2370, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:07:28.597574", "step": 2370, "epoch": 2 }, { "type": "loss", "content": 0.22401820123195648, "timestamp": "2025-09-05 09:07:28.599367", "step": 2371, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-05 09:07:28.765558", "step": 2371, "epoch": 2 }, { "type": "loss", "content": 0.23337076604366302, "timestamp": "2025-09-05 09:07:28.782887", "step": 2372, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:07:28.977131", "step": 2372, "epoch": 2 }, { "type": "loss", "content": 0.3868841230869293, "timestamp": "2025-09-05 09:07:28.979493", "step": 2373, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:07:29.176816", "step": 2373, "epoch": 2 }, { "type": "loss", "content": 0.47699031233787537, "timestamp": "2025-09-05 09:07:29.184646", "step": 2374, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:07:29.396413", "step": 2374, "epoch": 2 }, { "type": "loss", "content": 0.437484472990036, "timestamp": "2025-09-05 09:07:29.398070", "step": 2375, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:07:29.603081", "step": 2375, "epoch": 2 }, { "type": "loss", "content": 0.22171063721179962, "timestamp": "2025-09-05 09:07:29.611960", "step": 2376, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:07:29.772947", "step": 2376, "epoch": 2 }, { "type": "loss", "content": 0.2754736840724945, "timestamp": "2025-09-05 09:07:29.774820", "step": 2377, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:07:29.976846", "step": 2377, "epoch": 2 }, { "type": "loss", "content": 0.2224128544330597, "timestamp": "2025-09-05 09:07:29.978350", "step": 2378, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:07:30.142774", "step": 2378, "epoch": 2 }, { "type": "loss", "content": 0.26598674058914185, "timestamp": "2025-09-05 09:07:30.144652", "step": 2379, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:07:30.338071", "step": 2379, "epoch": 2 }, { "type": "loss", "content": 0.31558096408843994, "timestamp": "2025-09-05 09:07:30.347353", "step": 2380, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 09:07:35.003062", "step": 2380, "epoch": 2 }, { "type": "pplx", "content": 54.91370123427139, "timestamp": "2025-09-05 09:07:35.005288", "step": 2380, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:07:35.166638", "step": 2380, "epoch": 2 }, { "type": "loss", "content": 0.3278944790363312, "timestamp": "2025-09-05 09:07:35.168373", "step": 2381, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:07:35.334452", "step": 2381, "epoch": 2 }, { "type": "loss", "content": 0.3350487947463989, "timestamp": "2025-09-05 09:07:35.336389", "step": 2382, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:07:35.502370", "step": 2382, "epoch": 2 }, { "type": "loss", "content": 0.2530112862586975, "timestamp": "2025-09-05 09:07:35.504209", "step": 2383, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:07:35.671250", "step": 2383, "epoch": 2 }, { "type": "loss", "content": 0.34709376096725464, "timestamp": "2025-09-05 09:07:35.680926", "step": 2384, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:07:35.845859", "step": 2384, "epoch": 2 }, { "type": "loss", "content": 0.2088186889886856, "timestamp": "2025-09-05 09:07:35.847816", "step": 2385, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:07:36.013801", "step": 2385, "epoch": 2 }, { "type": "loss", "content": 0.2869564890861511, "timestamp": "2025-09-05 09:07:36.015947", "step": 2386, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:07:36.181958", "step": 2386, "epoch": 2 }, { "type": "loss", "content": 0.3361554741859436, "timestamp": "2025-09-05 09:07:36.184163", "step": 2387, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:07:36.352174", "step": 2387, "epoch": 2 }, { "type": "loss", "content": 0.35683155059814453, "timestamp": "2025-09-05 09:07:36.361191", "step": 2388, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:07:36.525010", "step": 2388, "epoch": 2 }, { "type": "loss", "content": 0.4390067756175995, "timestamp": "2025-09-05 09:07:36.526684", "step": 2389, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-05 09:07:36.691073", "step": 2389, "epoch": 2 }, { "type": "loss", "content": 0.3709789514541626, "timestamp": "2025-09-05 09:07:36.693111", "step": 2390, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:07:36.862890", "step": 2390, "epoch": 2 }, { "type": "loss", "content": 0.2603262960910797, "timestamp": "2025-09-05 09:07:36.865311", "step": 2391, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:07:37.031735", "step": 2391, "epoch": 2 }, { "type": "loss", "content": 0.23307450115680695, "timestamp": "2025-09-05 09:07:37.041183", "step": 2392, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:07:37.204920", "step": 2392, "epoch": 2 }, { "type": "loss", "content": 0.27466168999671936, "timestamp": "2025-09-05 09:07:37.206775", "step": 2393, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 4480027263872.0 }, "timestamp": "2025-09-05 09:07:37.373885", "step": 2393, "epoch": 2 }, { "type": "loss", "content": 0.2780590355396271, "timestamp": "2025-09-05 09:07:37.375972", "step": 2394, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:07:37.542987", "step": 2394, "epoch": 2 }, { "type": "loss", "content": 0.17993779480457306, "timestamp": "2025-09-05 09:07:37.545373", "step": 2395, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:07:37.711127", "step": 2395, "epoch": 2 }, { "type": "loss", "content": 0.33172348141670227, "timestamp": "2025-09-05 09:07:37.777082", "step": 2396, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:07:37.982581", "step": 2396, "epoch": 2 }, { "type": "loss", "content": 0.23976249992847443, "timestamp": "2025-09-05 09:07:37.985053", "step": 2397, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:07:38.166172", "step": 2397, "epoch": 2 }, { "type": "loss", "content": 0.3238065540790558, "timestamp": "2025-09-05 09:07:38.168100", "step": 2398, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:07:38.327669", "step": 2398, "epoch": 2 }, { "type": "loss", "content": 0.2310824692249298, "timestamp": "2025-09-05 09:07:38.329756", "step": 2399, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:07:38.488943", "step": 2399, "epoch": 2 }, { "type": "loss", "content": 0.13710328936576843, "timestamp": "2025-09-05 09:07:38.503058", "step": 2400, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 09:07:43.150193", "step": 2400, "epoch": 2 }, { "type": "pplx", "content": 53.65770720978935, "timestamp": "2025-09-05 09:07:43.152234", "step": 2400, "epoch": 2 }, { "type": "info", "content": "Checkpoint saved at step 2400", "timestamp": "2025-09-05 09:07:43.622167", "step": 2400, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:07:43.756497", "step": 2400, "epoch": 2 }, { "type": "loss", "content": 0.3784136474132538, "timestamp": "2025-09-05 09:07:43.758846", "step": 2401, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:07:43.928455", "step": 2401, "epoch": 2 }, { "type": "loss", "content": 0.28840699791908264, "timestamp": "2025-09-05 09:07:43.930366", "step": 2402, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:07:44.100558", "step": 2402, "epoch": 2 }, { "type": "loss", "content": 0.12435296177864075, "timestamp": "2025-09-05 09:07:44.103005", "step": 2403, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:07:44.339719", "step": 2403, "epoch": 2 }, { "type": "loss", "content": 0.19681872427463531, "timestamp": "2025-09-05 09:07:44.355064", "step": 2404, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:07:44.509599", "step": 2404, "epoch": 2 }, { "type": "loss", "content": 0.29926079511642456, "timestamp": "2025-09-05 09:07:44.511796", "step": 2405, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:07:44.682182", "step": 2405, "epoch": 2 }, { "type": "loss", "content": 0.39890509843826294, "timestamp": "2025-09-05 09:07:44.684493", "step": 2406, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:07:44.855392", "step": 2406, "epoch": 2 }, { "type": "loss", "content": 0.34845206141471863, "timestamp": "2025-09-05 09:07:44.857439", "step": 2407, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 272 ], "flops": 5440033091648.0 }, "timestamp": "2025-09-05 09:07:45.021834", "step": 2407, "epoch": 2 }, { "type": "loss", "content": 0.4749622046947479, "timestamp": "2025-09-05 09:07:45.036535", "step": 2408, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:07:45.289008", "step": 2408, "epoch": 2 }, { "type": "loss", "content": 0.17726445198059082, "timestamp": "2025-09-05 09:07:45.314028", "step": 2409, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:07:45.497454", "step": 2409, "epoch": 2 }, { "type": "loss", "content": 0.3263298571109772, "timestamp": "2025-09-05 09:07:45.514542", "step": 2410, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:07:45.745066", "step": 2410, "epoch": 2 }, { "type": "loss", "content": 0.2775588929653168, "timestamp": "2025-09-05 09:07:45.747377", "step": 2411, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:07:45.919583", "step": 2411, "epoch": 2 }, { "type": "loss", "content": 0.3065100610256195, "timestamp": "2025-09-05 09:07:45.975180", "step": 2412, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:07:46.147292", "step": 2412, "epoch": 2 }, { "type": "loss", "content": 0.28606003522872925, "timestamp": "2025-09-05 09:07:46.149413", "step": 2413, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:07:46.320168", "step": 2413, "epoch": 2 }, { "type": "loss", "content": 0.2223246544599533, "timestamp": "2025-09-05 09:07:46.322268", "step": 2414, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:07:46.481163", "step": 2414, "epoch": 2 }, { "type": "loss", "content": 0.4483068585395813, "timestamp": "2025-09-05 09:07:46.483191", "step": 2415, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:07:46.678206", "step": 2415, "epoch": 2 }, { "type": "loss", "content": 0.31489551067352295, "timestamp": "2025-09-05 09:07:46.692575", "step": 2416, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-05 09:07:46.844050", "step": 2416, "epoch": 2 }, { "type": "loss", "content": 0.2865988612174988, "timestamp": "2025-09-05 09:07:46.846269", "step": 2417, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:07:47.012544", "step": 2417, "epoch": 2 }, { "type": "loss", "content": 0.3174918293952942, "timestamp": "2025-09-05 09:07:47.014872", "step": 2418, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:07:47.186121", "step": 2418, "epoch": 2 }, { "type": "loss", "content": 0.26380813121795654, "timestamp": "2025-09-05 09:07:47.188686", "step": 2419, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:07:47.360860", "step": 2419, "epoch": 2 }, { "type": "loss", "content": 0.25339600443840027, "timestamp": "2025-09-05 09:07:47.375096", "step": 2420, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 09:07:52.646084", "step": 2420, "epoch": 2 }, { "type": "pplx", "content": 54.45162833560751, "timestamp": "2025-09-05 09:07:52.648300", "step": 2420, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-05 09:07:52.779251", "step": 2420, "epoch": 2 }, { "type": "loss", "content": 0.21093280613422394, "timestamp": "2025-09-05 09:07:52.781980", "step": 2421, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:07:52.938785", "step": 2421, "epoch": 2 }, { "type": "loss", "content": 0.24759776890277863, "timestamp": "2025-09-05 09:07:53.020176", "step": 2422, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:07:53.213342", "step": 2422, "epoch": 2 }, { "type": "loss", "content": 0.19092871248722076, "timestamp": "2025-09-05 09:07:53.215194", "step": 2423, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:07:53.382864", "step": 2423, "epoch": 2 }, { "type": "loss", "content": 0.38986799120903015, "timestamp": "2025-09-05 09:07:53.396260", "step": 2424, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:07:53.548122", "step": 2424, "epoch": 2 }, { "type": "loss", "content": 0.25403735041618347, "timestamp": "2025-09-05 09:07:53.550389", "step": 2425, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:07:53.707715", "step": 2425, "epoch": 2 }, { "type": "loss", "content": 0.39463475346565247, "timestamp": "2025-09-05 09:07:53.736535", "step": 2426, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-05 09:07:53.949730", "step": 2426, "epoch": 2 }, { "type": "loss", "content": 0.27436649799346924, "timestamp": "2025-09-05 09:07:53.982038", "step": 2427, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:07:54.210833", "step": 2427, "epoch": 2 }, { "type": "loss", "content": 0.3307807147502899, "timestamp": "2025-09-05 09:07:54.225106", "step": 2428, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:07:54.380225", "step": 2428, "epoch": 2 }, { "type": "loss", "content": 0.5084760785102844, "timestamp": "2025-09-05 09:07:54.382463", "step": 2429, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:07:54.552231", "step": 2429, "epoch": 2 }, { "type": "loss", "content": 0.2736986577510834, "timestamp": "2025-09-05 09:07:54.554064", "step": 2430, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:07:54.725829", "step": 2430, "epoch": 2 }, { "type": "loss", "content": 0.2925279140472412, "timestamp": "2025-09-05 09:07:54.728433", "step": 2431, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:07:54.886581", "step": 2431, "epoch": 2 }, { "type": "loss", "content": 0.27017101645469666, "timestamp": "2025-09-05 09:07:54.903369", "step": 2432, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:07:55.063357", "step": 2432, "epoch": 2 }, { "type": "loss", "content": 0.2796265780925751, "timestamp": "2025-09-05 09:07:55.066444", "step": 2433, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:07:55.224425", "step": 2433, "epoch": 2 }, { "type": "loss", "content": 0.35782158374786377, "timestamp": "2025-09-05 09:07:55.227028", "step": 2434, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:07:55.364722", "step": 2434, "epoch": 2 }, { "type": "loss", "content": 0.3305521607398987, "timestamp": "2025-09-05 09:07:55.366733", "step": 2435, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:07:55.538296", "step": 2435, "epoch": 2 }, { "type": "loss", "content": 0.3081381618976593, "timestamp": "2025-09-05 09:07:55.555184", "step": 2436, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:07:55.716234", "step": 2436, "epoch": 2 }, { "type": "loss", "content": 0.3570384681224823, "timestamp": "2025-09-05 09:07:55.718614", "step": 2437, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:07:55.888053", "step": 2437, "epoch": 2 }, { "type": "loss", "content": 0.15754249691963196, "timestamp": "2025-09-05 09:07:55.890184", "step": 2438, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:07:56.056501", "step": 2438, "epoch": 2 }, { "type": "loss", "content": 0.20244644582271576, "timestamp": "2025-09-05 09:07:56.058763", "step": 2439, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:07:56.216598", "step": 2439, "epoch": 2 }, { "type": "loss", "content": 0.2593131959438324, "timestamp": "2025-09-05 09:07:56.230745", "step": 2440, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 09:08:01.348684", "step": 2440, "epoch": 2 }, { "type": "pplx", "content": 54.726233382312806, "timestamp": "2025-09-05 09:08:01.352558", "step": 2440, "epoch": 2 }, { "type": "info", "content": "Checkpoint saved at step 2440", "timestamp": "2025-09-05 09:08:01.916075", "step": 2440, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:08:02.075987", "step": 2440, "epoch": 2 }, { "type": "loss", "content": 0.22823497653007507, "timestamp": "2025-09-05 09:08:02.078058", "step": 2441, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:08:02.251582", "step": 2441, "epoch": 2 }, { "type": "loss", "content": 0.13066086173057556, "timestamp": "2025-09-05 09:08:02.254006", "step": 2442, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:08:02.418800", "step": 2442, "epoch": 2 }, { "type": "loss", "content": 0.3410320580005646, "timestamp": "2025-09-05 09:08:02.420955", "step": 2443, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:08:02.586353", "step": 2443, "epoch": 2 }, { "type": "loss", "content": 0.3090682327747345, "timestamp": "2025-09-05 09:08:02.600282", "step": 2444, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:08:02.757752", "step": 2444, "epoch": 2 }, { "type": "loss", "content": 0.3452575206756592, "timestamp": "2025-09-05 09:08:02.760185", "step": 2445, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:08:02.927518", "step": 2445, "epoch": 2 }, { "type": "loss", "content": 0.31340721249580383, "timestamp": "2025-09-05 09:08:02.929857", "step": 2446, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:08:03.190905", "step": 2446, "epoch": 2 }, { "type": "loss", "content": 0.3519221544265747, "timestamp": "2025-09-05 09:08:03.192880", "step": 2447, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 4480027263872.0 }, "timestamp": "2025-09-05 09:08:03.367264", "step": 2447, "epoch": 2 }, { "type": "loss", "content": 0.5222123265266418, "timestamp": "2025-09-05 09:08:03.381834", "step": 2448, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:08:03.540496", "step": 2448, "epoch": 2 }, { "type": "loss", "content": 0.25161540508270264, "timestamp": "2025-09-05 09:08:03.542795", "step": 2449, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:08:03.709099", "step": 2449, "epoch": 2 }, { "type": "loss", "content": 0.28207287192344666, "timestamp": "2025-09-05 09:08:03.752015", "step": 2450, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:08:04.036556", "step": 2450, "epoch": 2 }, { "type": "loss", "content": 0.4279467761516571, "timestamp": "2025-09-05 09:08:04.039030", "step": 2451, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:08:04.354549", "step": 2451, "epoch": 2 }, { "type": "loss", "content": 0.32319822907447815, "timestamp": "2025-09-05 09:08:04.369029", "step": 2452, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:08:04.538604", "step": 2452, "epoch": 2 }, { "type": "loss", "content": 0.27443209290504456, "timestamp": "2025-09-05 09:08:04.598961", "step": 2453, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:08:04.818752", "step": 2453, "epoch": 2 }, { "type": "loss", "content": 0.2456836998462677, "timestamp": "2025-09-05 09:08:04.821148", "step": 2454, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:08:04.988344", "step": 2454, "epoch": 2 }, { "type": "loss", "content": 0.35845696926116943, "timestamp": "2025-09-05 09:08:04.990202", "step": 2455, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:08:05.154815", "step": 2455, "epoch": 2 }, { "type": "loss", "content": 0.40192878246307373, "timestamp": "2025-09-05 09:08:05.171087", "step": 2456, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:08:05.339221", "step": 2456, "epoch": 2 }, { "type": "loss", "content": 0.38854965567588806, "timestamp": "2025-09-05 09:08:05.342153", "step": 2457, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:08:05.506686", "step": 2457, "epoch": 2 }, { "type": "loss", "content": 0.36593395471572876, "timestamp": "2025-09-05 09:08:05.508864", "step": 2458, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:08:05.673878", "step": 2458, "epoch": 2 }, { "type": "loss", "content": 0.3558920621871948, "timestamp": "2025-09-05 09:08:05.676132", "step": 2459, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 4480027263872.0 }, "timestamp": "2025-09-05 09:08:05.841507", "step": 2459, "epoch": 2 }, { "type": "loss", "content": 0.23418471217155457, "timestamp": "2025-09-05 09:08:05.858314", "step": 2460, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 09:08:10.996179", "step": 2460, "epoch": 2 }, { "type": "pplx", "content": 53.48732085430296, "timestamp": "2025-09-05 09:08:11.039993", "step": 2460, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:08:11.173071", "step": 2460, "epoch": 2 }, { "type": "loss", "content": 0.2769435942173004, "timestamp": "2025-09-05 09:08:11.189561", "step": 2461, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:08:11.452212", "step": 2461, "epoch": 2 }, { "type": "loss", "content": 0.44493263959884644, "timestamp": "2025-09-05 09:08:11.454879", "step": 2462, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:08:11.710370", "step": 2462, "epoch": 2 }, { "type": "loss", "content": 0.274783194065094, "timestamp": "2025-09-05 09:08:11.712867", "step": 2463, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:08:11.932413", "step": 2463, "epoch": 2 }, { "type": "loss", "content": 0.2961232364177704, "timestamp": "2025-09-05 09:08:11.945952", "step": 2464, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:08:12.106328", "step": 2464, "epoch": 2 }, { "type": "loss", "content": 0.19862420856952667, "timestamp": "2025-09-05 09:08:12.108278", "step": 2465, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:08:12.286236", "step": 2465, "epoch": 2 }, { "type": "loss", "content": 0.49661245942115784, "timestamp": "2025-09-05 09:08:12.288268", "step": 2466, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:08:12.423310", "step": 2466, "epoch": 2 }, { "type": "loss", "content": 0.3499586284160614, "timestamp": "2025-09-05 09:08:12.425494", "step": 2467, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:08:12.590694", "step": 2467, "epoch": 2 }, { "type": "loss", "content": 0.3866555392742157, "timestamp": "2025-09-05 09:08:12.604767", "step": 2468, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:08:12.762857", "step": 2468, "epoch": 2 }, { "type": "loss", "content": 0.15350095927715302, "timestamp": "2025-09-05 09:08:12.765656", "step": 2469, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:08:12.930539", "step": 2469, "epoch": 2 }, { "type": "loss", "content": 0.2772982716560364, "timestamp": "2025-09-05 09:08:12.933067", "step": 2470, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:08:13.150777", "step": 2470, "epoch": 2 }, { "type": "loss", "content": 0.2272377461194992, "timestamp": "2025-09-05 09:08:13.153293", "step": 2471, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:08:13.323513", "step": 2471, "epoch": 2 }, { "type": "loss", "content": 0.3104439973831177, "timestamp": "2025-09-05 09:08:13.339831", "step": 2472, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:08:13.506609", "step": 2472, "epoch": 2 }, { "type": "loss", "content": 0.3005099892616272, "timestamp": "2025-09-05 09:08:13.509163", "step": 2473, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:08:13.728381", "step": 2473, "epoch": 2 }, { "type": "loss", "content": 0.32455500960350037, "timestamp": "2025-09-05 09:08:13.730644", "step": 2474, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:08:13.971396", "step": 2474, "epoch": 2 }, { "type": "loss", "content": 0.3721053898334503, "timestamp": "2025-09-05 09:08:13.995417", "step": 2475, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:08:14.176746", "step": 2475, "epoch": 2 }, { "type": "loss", "content": 0.33817243576049805, "timestamp": "2025-09-05 09:08:14.194035", "step": 2476, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:08:14.364770", "step": 2476, "epoch": 2 }, { "type": "loss", "content": 0.4381646513938904, "timestamp": "2025-09-05 09:08:14.367767", "step": 2477, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:08:14.586996", "step": 2477, "epoch": 2 }, { "type": "loss", "content": 0.45706042647361755, "timestamp": "2025-09-05 09:08:14.589275", "step": 2478, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:08:14.811775", "step": 2478, "epoch": 2 }, { "type": "loss", "content": 0.3688715994358063, "timestamp": "2025-09-05 09:08:14.813672", "step": 2479, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:08:15.011990", "step": 2479, "epoch": 2 }, { "type": "loss", "content": 0.365582138299942, "timestamp": "2025-09-05 09:08:15.025519", "step": 2480, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 09:08:20.267144", "step": 2480, "epoch": 2 }, { "type": "pplx", "content": 52.49038722586114, "timestamp": "2025-09-05 09:08:20.269130", "step": 2480, "epoch": 2 }, { "type": "info", "content": "Checkpoint saved at step 2480", "timestamp": "2025-09-05 09:08:20.748386", "step": 2480, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:08:20.908304", "step": 2480, "epoch": 2 }, { "type": "loss", "content": 0.3301137685775757, "timestamp": "2025-09-05 09:08:20.911345", "step": 2481, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:08:21.048283", "step": 2481, "epoch": 2 }, { "type": "loss", "content": 0.2704985737800598, "timestamp": "2025-09-05 09:08:21.051353", "step": 2482, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:08:21.272855", "step": 2482, "epoch": 2 }, { "type": "loss", "content": 0.24071909487247467, "timestamp": "2025-09-05 09:08:21.316201", "step": 2483, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-05 09:08:21.495818", "step": 2483, "epoch": 2 }, { "type": "loss", "content": 0.27359169721603394, "timestamp": "2025-09-05 09:08:21.585302", "step": 2484, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:08:21.776756", "step": 2484, "epoch": 2 }, { "type": "loss", "content": 0.30878910422325134, "timestamp": "2025-09-05 09:08:21.779385", "step": 2485, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:08:21.944264", "step": 2485, "epoch": 2 }, { "type": "loss", "content": 0.3247639834880829, "timestamp": "2025-09-05 09:08:21.986129", "step": 2486, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:08:22.212722", "step": 2486, "epoch": 2 }, { "type": "loss", "content": 0.2619643211364746, "timestamp": "2025-09-05 09:08:22.233914", "step": 2487, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:08:22.458476", "step": 2487, "epoch": 2 }, { "type": "loss", "content": 0.26061657071113586, "timestamp": "2025-09-05 09:08:22.472954", "step": 2488, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:08:22.632179", "step": 2488, "epoch": 2 }, { "type": "loss", "content": 0.3099912106990814, "timestamp": "2025-09-05 09:08:22.635510", "step": 2489, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:08:22.799538", "step": 2489, "epoch": 2 }, { "type": "loss", "content": 0.21963852643966675, "timestamp": "2025-09-05 09:08:22.802163", "step": 2490, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:08:22.978088", "step": 2490, "epoch": 2 }, { "type": "loss", "content": 0.3000223636627197, "timestamp": "2025-09-05 09:08:22.981287", "step": 2491, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:08:23.157663", "step": 2491, "epoch": 2 }, { "type": "loss", "content": 0.3209587335586548, "timestamp": "2025-09-05 09:08:23.174124", "step": 2492, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:08:23.342854", "step": 2492, "epoch": 2 }, { "type": "loss", "content": 0.33791443705558777, "timestamp": "2025-09-05 09:08:23.346174", "step": 2493, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:08:23.510842", "step": 2493, "epoch": 2 }, { "type": "loss", "content": 0.5459884405136108, "timestamp": "2025-09-05 09:08:23.513096", "step": 2494, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-05 09:08:23.679341", "step": 2494, "epoch": 2 }, { "type": "loss", "content": 0.2700771987438202, "timestamp": "2025-09-05 09:08:23.681816", "step": 2495, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:08:23.847783", "step": 2495, "epoch": 2 }, { "type": "loss", "content": 0.30430659651756287, "timestamp": "2025-09-05 09:08:23.858129", "step": 2496, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:08:23.992946", "step": 2496, "epoch": 2 }, { "type": "loss", "content": 0.26771894097328186, "timestamp": "2025-09-05 09:08:23.995222", "step": 2497, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:08:24.257840", "step": 2497, "epoch": 2 }, { "type": "loss", "content": 0.32667091488838196, "timestamp": "2025-09-05 09:08:24.300330", "step": 2498, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:08:24.522074", "step": 2498, "epoch": 2 }, { "type": "loss", "content": 0.29449260234832764, "timestamp": "2025-09-05 09:08:24.524635", "step": 2499, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:08:24.689395", "step": 2499, "epoch": 2 }, { "type": "loss", "content": 0.2695910930633545, "timestamp": "2025-09-05 09:08:24.703400", "step": 2500, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 09:08:30.131163", "step": 2500, "epoch": 2 }, { "type": "pplx", "content": 52.813743962987445, "timestamp": "2025-09-05 09:08:30.133472", "step": 2500, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:08:30.266354", "step": 2500, "epoch": 2 }, { "type": "loss", "content": 0.33899521827697754, "timestamp": "2025-09-05 09:08:30.268500", "step": 2501, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:08:30.406032", "step": 2501, "epoch": 2 }, { "type": "loss", "content": 0.4425645172595978, "timestamp": "2025-09-05 09:08:30.409138", "step": 2502, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:08:30.730139", "step": 2502, "epoch": 2 }, { "type": "loss", "content": 0.3839690387248993, "timestamp": "2025-09-05 09:08:30.732271", "step": 2503, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:08:30.944016", "step": 2503, "epoch": 2 }, { "type": "loss", "content": 0.28737911581993103, "timestamp": "2025-09-05 09:08:30.958478", "step": 2504, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:08:31.147199", "step": 2504, "epoch": 2 }, { "type": "loss", "content": 0.5052091479301453, "timestamp": "2025-09-05 09:08:31.171139", "step": 2505, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:08:31.513127", "step": 2505, "epoch": 2 }, { "type": "loss", "content": 0.34135702252388, "timestamp": "2025-09-05 09:08:31.515377", "step": 2506, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:08:31.720097", "step": 2506, "epoch": 2 }, { "type": "loss", "content": 0.22179536521434784, "timestamp": "2025-09-05 09:08:31.722369", "step": 2507, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:08:31.890591", "step": 2507, "epoch": 2 }, { "type": "loss", "content": 0.3495122790336609, "timestamp": "2025-09-05 09:08:31.907828", "step": 2508, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:08:32.101530", "step": 2508, "epoch": 2 }, { "type": "loss", "content": 0.252922922372818, "timestamp": "2025-09-05 09:08:32.105574", "step": 2509, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:08:32.310055", "step": 2509, "epoch": 2 }, { "type": "loss", "content": 0.2928535044193268, "timestamp": "2025-09-05 09:08:32.312376", "step": 2510, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:08:32.561305", "step": 2510, "epoch": 2 }, { "type": "loss", "content": 0.3097461462020874, "timestamp": "2025-09-05 09:08:32.564710", "step": 2511, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 4480027263872.0 }, "timestamp": "2025-09-05 09:08:32.772402", "step": 2511, "epoch": 2 }, { "type": "loss", "content": 0.3948187232017517, "timestamp": "2025-09-05 09:08:32.789068", "step": 2512, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:08:33.031818", "step": 2512, "epoch": 2 }, { "type": "loss", "content": 0.2657097578048706, "timestamp": "2025-09-05 09:08:33.034507", "step": 2513, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:08:33.241556", "step": 2513, "epoch": 2 }, { "type": "loss", "content": 0.3475334346294403, "timestamp": "2025-09-05 09:08:33.244226", "step": 2514, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:08:33.451829", "step": 2514, "epoch": 2 }, { "type": "loss", "content": 0.2843340039253235, "timestamp": "2025-09-05 09:08:33.453852", "step": 2515, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:08:33.658405", "step": 2515, "epoch": 2 }, { "type": "loss", "content": 0.4441480338573456, "timestamp": "2025-09-05 09:08:33.673480", "step": 2516, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:08:33.862983", "step": 2516, "epoch": 2 }, { "type": "loss", "content": 0.24662137031555176, "timestamp": "2025-09-05 09:08:33.865600", "step": 2517, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:08:34.080449", "step": 2517, "epoch": 2 }, { "type": "loss", "content": 0.40211036801338196, "timestamp": "2025-09-05 09:08:34.082800", "step": 2518, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:08:34.371690", "step": 2518, "epoch": 2 }, { "type": "loss", "content": 0.4713238477706909, "timestamp": "2025-09-05 09:08:34.374322", "step": 2519, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:08:34.571969", "step": 2519, "epoch": 2 }, { "type": "loss", "content": 0.3440394401550293, "timestamp": "2025-09-05 09:08:34.587486", "step": 2520, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 09:08:40.157649", "step": 2520, "epoch": 2 }, { "type": "pplx", "content": 53.0074163999356, "timestamp": "2025-09-05 09:08:40.159389", "step": 2520, "epoch": 2 }, { "type": "info", "content": "Checkpoint saved at step 2520", "timestamp": "2025-09-05 09:08:40.670948", "step": 2520, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:08:40.870996", "step": 2520, "epoch": 2 }, { "type": "loss", "content": 0.2662496566772461, "timestamp": "2025-09-05 09:08:40.873982", "step": 2521, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:08:41.068057", "step": 2521, "epoch": 2 }, { "type": "loss", "content": 0.2788788974285126, "timestamp": "2025-09-05 09:08:41.070930", "step": 2522, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:08:41.236468", "step": 2522, "epoch": 2 }, { "type": "loss", "content": 0.2379118651151657, "timestamp": "2025-09-05 09:08:41.240108", "step": 2523, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:08:41.445747", "step": 2523, "epoch": 2 }, { "type": "loss", "content": 0.2710990309715271, "timestamp": "2025-09-05 09:08:41.460173", "step": 2524, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:08:41.647392", "step": 2524, "epoch": 2 }, { "type": "loss", "content": 0.35253018140792847, "timestamp": "2025-09-05 09:08:41.649673", "step": 2525, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:08:41.899382", "step": 2525, "epoch": 2 }, { "type": "loss", "content": 0.33853983879089355, "timestamp": "2025-09-05 09:08:41.903036", "step": 2526, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:08:42.152763", "step": 2526, "epoch": 2 }, { "type": "loss", "content": 0.2997421324253082, "timestamp": "2025-09-05 09:08:42.195973", "step": 2527, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-05 09:08:42.443160", "step": 2527, "epoch": 2 }, { "type": "loss", "content": 0.22982150316238403, "timestamp": "2025-09-05 09:08:42.460245", "step": 2528, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:08:42.657316", "step": 2528, "epoch": 2 }, { "type": "loss", "content": 0.27433791756629944, "timestamp": "2025-09-05 09:08:42.661098", "step": 2529, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:08:42.859379", "step": 2529, "epoch": 2 }, { "type": "loss", "content": 0.4138650894165039, "timestamp": "2025-09-05 09:08:42.861980", "step": 2530, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:08:43.057651", "step": 2530, "epoch": 2 }, { "type": "loss", "content": 0.5147197842597961, "timestamp": "2025-09-05 09:08:43.061658", "step": 2531, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:08:43.258384", "step": 2531, "epoch": 2 }, { "type": "loss", "content": 0.3644541800022125, "timestamp": "2025-09-05 09:08:43.277457", "step": 2532, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:08:43.595788", "step": 2532, "epoch": 2 }, { "type": "loss", "content": 0.21938161551952362, "timestamp": "2025-09-05 09:08:43.598479", "step": 2533, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:08:43.796759", "step": 2533, "epoch": 2 }, { "type": "loss", "content": 0.3870631754398346, "timestamp": "2025-09-05 09:08:43.799743", "step": 2534, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:08:43.999027", "step": 2534, "epoch": 2 }, { "type": "loss", "content": 0.1749730259180069, "timestamp": "2025-09-05 09:08:44.002043", "step": 2535, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:08:44.198000", "step": 2535, "epoch": 2 }, { "type": "loss", "content": 0.2818840742111206, "timestamp": "2025-09-05 09:08:44.212763", "step": 2536, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:08:44.400117", "step": 2536, "epoch": 2 }, { "type": "loss", "content": 0.22080212831497192, "timestamp": "2025-09-05 09:08:44.403454", "step": 2537, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:08:44.610140", "step": 2537, "epoch": 2 }, { "type": "loss", "content": 0.2120576649904251, "timestamp": "2025-09-05 09:08:44.613452", "step": 2538, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:08:44.862843", "step": 2538, "epoch": 2 }, { "type": "loss", "content": 0.17697148025035858, "timestamp": "2025-09-05 09:08:44.864931", "step": 2539, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:08:45.040233", "step": 2539, "epoch": 2 }, { "type": "loss", "content": 0.37181296944618225, "timestamp": "2025-09-05 09:08:45.055138", "step": 2540, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 09:08:50.212502", "step": 2540, "epoch": 2 }, { "type": "pplx", "content": 53.064035598436966, "timestamp": "2025-09-05 09:08:50.214408", "step": 2540, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:08:50.346568", "step": 2540, "epoch": 2 }, { "type": "loss", "content": 0.3222182095050812, "timestamp": "2025-09-05 09:08:50.348433", "step": 2541, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:08:50.484589", "step": 2541, "epoch": 2 }, { "type": "loss", "content": 0.15369223058223724, "timestamp": "2025-09-05 09:08:50.486899", "step": 2542, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:08:50.622707", "step": 2542, "epoch": 2 }, { "type": "loss", "content": 0.37195268273353577, "timestamp": "2025-09-05 09:08:50.624641", "step": 2543, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:08:50.761152", "step": 2543, "epoch": 2 }, { "type": "loss", "content": 0.3791513741016388, "timestamp": "2025-09-05 09:08:50.769844", "step": 2544, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:08:50.902437", "step": 2544, "epoch": 2 }, { "type": "loss", "content": 0.25493812561035156, "timestamp": "2025-09-05 09:08:50.904432", "step": 2545, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:08:51.062401", "step": 2545, "epoch": 2 }, { "type": "loss", "content": 0.34883612394332886, "timestamp": "2025-09-05 09:08:51.064212", "step": 2546, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 4480027263872.0 }, "timestamp": "2025-09-05 09:08:51.235180", "step": 2546, "epoch": 2 }, { "type": "loss", "content": 0.36856377124786377, "timestamp": "2025-09-05 09:08:51.237348", "step": 2547, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:08:51.372850", "step": 2547, "epoch": 2 }, { "type": "loss", "content": 0.27879372239112854, "timestamp": "2025-09-05 09:08:51.386856", "step": 2548, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:08:51.538291", "step": 2548, "epoch": 2 }, { "type": "loss", "content": 0.26352420449256897, "timestamp": "2025-09-05 09:08:51.539972", "step": 2549, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:08:51.709065", "step": 2549, "epoch": 2 }, { "type": "loss", "content": 0.3046532869338989, "timestamp": "2025-09-05 09:08:51.711027", "step": 2550, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 4480027263872.0 }, "timestamp": "2025-09-05 09:08:51.872075", "step": 2550, "epoch": 2 }, { "type": "loss", "content": 0.45060494542121887, "timestamp": "2025-09-05 09:08:51.876519", "step": 2551, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:08:52.036405", "step": 2551, "epoch": 2 }, { "type": "loss", "content": 0.4088442027568817, "timestamp": "2025-09-05 09:08:52.050278", "step": 2552, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:08:52.201867", "step": 2552, "epoch": 2 }, { "type": "loss", "content": 0.2506445646286011, "timestamp": "2025-09-05 09:08:52.203855", "step": 2553, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:08:52.361528", "step": 2553, "epoch": 2 }, { "type": "loss", "content": 0.2608177363872528, "timestamp": "2025-09-05 09:08:52.363491", "step": 2554, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:08:52.521849", "step": 2554, "epoch": 2 }, { "type": "loss", "content": 0.35635024309158325, "timestamp": "2025-09-05 09:08:52.523750", "step": 2555, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:08:52.681338", "step": 2555, "epoch": 2 }, { "type": "loss", "content": 0.3468969166278839, "timestamp": "2025-09-05 09:08:52.696825", "step": 2556, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:08:52.855565", "step": 2556, "epoch": 2 }, { "type": "loss", "content": 0.421159029006958, "timestamp": "2025-09-05 09:08:52.857810", "step": 2557, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:08:53.018521", "step": 2557, "epoch": 2 }, { "type": "loss", "content": 0.3261510729789734, "timestamp": "2025-09-05 09:08:53.020341", "step": 2558, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:08:53.190276", "step": 2558, "epoch": 2 }, { "type": "loss", "content": 0.4048421084880829, "timestamp": "2025-09-05 09:08:53.192324", "step": 2559, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:08:53.362579", "step": 2559, "epoch": 2 }, { "type": "loss", "content": 0.41077789664268494, "timestamp": "2025-09-05 09:08:53.378509", "step": 2560, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 09:08:58.019506", "step": 2560, "epoch": 2 }, { "type": "pplx", "content": 54.037454830465165, "timestamp": "2025-09-05 09:08:58.021631", "step": 2560, "epoch": 2 }, { "type": "info", "content": "Checkpoint saved at step 2560", "timestamp": "2025-09-05 09:08:58.543085", "step": 2560, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:08:58.681373", "step": 2560, "epoch": 2 }, { "type": "loss", "content": 0.25367656350135803, "timestamp": "2025-09-05 09:08:58.683466", "step": 2561, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:08:58.839482", "step": 2561, "epoch": 2 }, { "type": "loss", "content": 0.2890976667404175, "timestamp": "2025-09-05 09:08:58.841709", "step": 2562, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:08:58.999471", "step": 2562, "epoch": 2 }, { "type": "loss", "content": 0.3569534718990326, "timestamp": "2025-09-05 09:08:59.001771", "step": 2563, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:08:59.170317", "step": 2563, "epoch": 2 }, { "type": "loss", "content": 0.26353394985198975, "timestamp": "2025-09-05 09:08:59.183922", "step": 2564, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:08:59.337378", "step": 2564, "epoch": 2 }, { "type": "loss", "content": 0.278327077627182, "timestamp": "2025-09-05 09:08:59.339152", "step": 2565, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 4480027263872.0 }, "timestamp": "2025-09-05 09:08:59.506890", "step": 2565, "epoch": 2 }, { "type": "loss", "content": 0.4738348126411438, "timestamp": "2025-09-05 09:08:59.508929", "step": 2566, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:08:59.666733", "step": 2566, "epoch": 2 }, { "type": "loss", "content": 0.22108004987239838, "timestamp": "2025-09-05 09:08:59.668438", "step": 2567, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:08:59.824336", "step": 2567, "epoch": 2 }, { "type": "loss", "content": 0.3641687333583832, "timestamp": "2025-09-05 09:08:59.838720", "step": 2568, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:08:59.997932", "step": 2568, "epoch": 2 }, { "type": "loss", "content": 0.2895580232143402, "timestamp": "2025-09-05 09:09:00.000313", "step": 2569, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:09:00.135261", "step": 2569, "epoch": 2 }, { "type": "loss", "content": 0.3097214698791504, "timestamp": "2025-09-05 09:09:00.137099", "step": 2570, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:09:00.304405", "step": 2570, "epoch": 2 }, { "type": "loss", "content": 0.3036538064479828, "timestamp": "2025-09-05 09:09:00.306729", "step": 2571, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:09:00.463817", "step": 2571, "epoch": 2 }, { "type": "loss", "content": 0.3194674849510193, "timestamp": "2025-09-05 09:09:00.477410", "step": 2572, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:09:00.628501", "step": 2572, "epoch": 2 }, { "type": "loss", "content": 0.3619299829006195, "timestamp": "2025-09-05 09:09:00.630635", "step": 2573, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:09:00.790157", "step": 2573, "epoch": 2 }, { "type": "loss", "content": 0.2202121913433075, "timestamp": "2025-09-05 09:09:00.792091", "step": 2574, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:09:00.948683", "step": 2574, "epoch": 2 }, { "type": "loss", "content": 0.2178782969713211, "timestamp": "2025-09-05 09:09:00.950541", "step": 2575, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:09:01.121052", "step": 2575, "epoch": 2 }, { "type": "loss", "content": 0.30588048696517944, "timestamp": "2025-09-05 09:09:01.135032", "step": 2576, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 4800029206464.0 }, "timestamp": "2025-09-05 09:09:01.288635", "step": 2576, "epoch": 2 }, { "type": "loss", "content": 0.252217561006546, "timestamp": "2025-09-05 09:09:01.290940", "step": 2577, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 4480027263872.0 }, "timestamp": "2025-09-05 09:09:01.460305", "step": 2577, "epoch": 2 }, { "type": "loss", "content": 0.3836561143398285, "timestamp": "2025-09-05 09:09:01.462184", "step": 2578, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:09:01.632772", "step": 2578, "epoch": 2 }, { "type": "loss", "content": 0.14467762410640717, "timestamp": "2025-09-05 09:09:01.635088", "step": 2579, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:09:01.772020", "step": 2579, "epoch": 2 }, { "type": "loss", "content": 0.36755555868148804, "timestamp": "2025-09-05 09:09:01.788192", "step": 2580, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 09:09:06.435373", "step": 2580, "epoch": 2 }, { "type": "pplx", "content": 54.81730763687925, "timestamp": "2025-09-05 09:09:06.438007", "step": 2580, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:09:06.570269", "step": 2580, "epoch": 2 }, { "type": "loss", "content": 0.3310108780860901, "timestamp": "2025-09-05 09:09:06.572346", "step": 2581, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:09:06.736690", "step": 2581, "epoch": 2 }, { "type": "loss", "content": 0.3351535499095917, "timestamp": "2025-09-05 09:09:06.738837", "step": 2582, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:09:06.902060", "step": 2582, "epoch": 2 }, { "type": "loss", "content": 0.3105543851852417, "timestamp": "2025-09-05 09:09:06.904103", "step": 2583, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:09:07.068138", "step": 2583, "epoch": 2 }, { "type": "loss", "content": 0.3798165023326874, "timestamp": "2025-09-05 09:09:07.081927", "step": 2584, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:09:07.239587", "step": 2584, "epoch": 2 }, { "type": "loss", "content": 0.2240927517414093, "timestamp": "2025-09-05 09:09:07.241587", "step": 2585, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 4480027263872.0 }, "timestamp": "2025-09-05 09:09:07.446448", "step": 2585, "epoch": 2 }, { "type": "loss", "content": 0.44238588213920593, "timestamp": "2025-09-05 09:09:07.448922", "step": 2586, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:09:07.617891", "step": 2586, "epoch": 2 }, { "type": "loss", "content": 0.3627106845378876, "timestamp": "2025-09-05 09:09:07.620036", "step": 2587, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 4800029206464.0 }, "timestamp": "2025-09-05 09:09:07.788432", "step": 2587, "epoch": 2 }, { "type": "loss", "content": 0.41372790932655334, "timestamp": "2025-09-05 09:09:07.797290", "step": 2588, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:09:07.933122", "step": 2588, "epoch": 2 }, { "type": "loss", "content": 0.1950422078371048, "timestamp": "2025-09-05 09:09:07.935093", "step": 2589, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:09:08.070991", "step": 2589, "epoch": 2 }, { "type": "loss", "content": 0.23546253144741058, "timestamp": "2025-09-05 09:09:08.073472", "step": 2590, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:09:08.251746", "step": 2590, "epoch": 2 }, { "type": "loss", "content": 0.24141792953014374, "timestamp": "2025-09-05 09:09:08.254326", "step": 2591, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:09:08.425452", "step": 2591, "epoch": 2 }, { "type": "loss", "content": 0.3847906291484833, "timestamp": "2025-09-05 09:09:08.439722", "step": 2592, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:09:08.634658", "step": 2592, "epoch": 2 }, { "type": "loss", "content": 0.34318065643310547, "timestamp": "2025-09-05 09:09:08.637294", "step": 2593, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:09:08.822224", "step": 2593, "epoch": 2 }, { "type": "loss", "content": 0.3513537645339966, "timestamp": "2025-09-05 09:09:08.824692", "step": 2594, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:09:09.010563", "step": 2594, "epoch": 2 }, { "type": "loss", "content": 0.2467866986989975, "timestamp": "2025-09-05 09:09:09.014666", "step": 2595, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:09:09.204328", "step": 2595, "epoch": 2 }, { "type": "loss", "content": 0.31491708755493164, "timestamp": "2025-09-05 09:09:09.221207", "step": 2596, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:09:09.387748", "step": 2596, "epoch": 2 }, { "type": "loss", "content": 0.42033904790878296, "timestamp": "2025-09-05 09:09:09.390084", "step": 2597, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:09:09.556120", "step": 2597, "epoch": 2 }, { "type": "loss", "content": 0.25425952672958374, "timestamp": "2025-09-05 09:09:09.558324", "step": 2598, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:09:09.721976", "step": 2598, "epoch": 2 }, { "type": "loss", "content": 0.2386419177055359, "timestamp": "2025-09-05 09:09:09.724701", "step": 2599, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:09:09.888119", "step": 2599, "epoch": 2 }, { "type": "loss", "content": 0.1672760248184204, "timestamp": "2025-09-05 09:09:09.902009", "step": 2600, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 09:09:14.531558", "step": 2600, "epoch": 2 }, { "type": "pplx", "content": 54.832467462479364, "timestamp": "2025-09-05 09:09:14.533780", "step": 2600, "epoch": 2 }, { "type": "info", "content": "Checkpoint saved at step 2600", "timestamp": "2025-09-05 09:09:15.004154", "step": 2600, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:09:15.143984", "step": 2600, "epoch": 3 }, { "type": "loss", "content": 0.2909306585788727, "timestamp": "2025-09-05 09:09:15.145976", "step": 2601, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:09:15.310389", "step": 2601, "epoch": 3 }, { "type": "loss", "content": 0.369361937046051, "timestamp": "2025-09-05 09:09:15.312190", "step": 2602, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:09:15.475409", "step": 2602, "epoch": 3 }, { "type": "loss", "content": 0.2690865099430084, "timestamp": "2025-09-05 09:09:15.477319", "step": 2603, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:09:15.649941", "step": 2603, "epoch": 3 }, { "type": "loss", "content": 0.41636621952056885, "timestamp": "2025-09-05 09:09:15.664165", "step": 2604, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:09:15.829025", "step": 2604, "epoch": 3 }, { "type": "loss", "content": 0.24433229863643646, "timestamp": "2025-09-05 09:09:15.830945", "step": 2605, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:09:15.996184", "step": 2605, "epoch": 3 }, { "type": "loss", "content": 0.3325082063674927, "timestamp": "2025-09-05 09:09:15.998155", "step": 2606, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:09:16.161457", "step": 2606, "epoch": 3 }, { "type": "loss", "content": 0.2946546971797943, "timestamp": "2025-09-05 09:09:16.163593", "step": 2607, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:09:16.327289", "step": 2607, "epoch": 3 }, { "type": "loss", "content": 0.46848660707473755, "timestamp": "2025-09-05 09:09:16.341734", "step": 2608, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:09:16.498726", "step": 2608, "epoch": 3 }, { "type": "loss", "content": 0.39881542325019836, "timestamp": "2025-09-05 09:09:16.502038", "step": 2609, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:09:16.669668", "step": 2609, "epoch": 3 }, { "type": "loss", "content": 0.3879721462726593, "timestamp": "2025-09-05 09:09:16.671710", "step": 2610, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:09:16.844414", "step": 2610, "epoch": 3 }, { "type": "loss", "content": 0.3810281455516815, "timestamp": "2025-09-05 09:09:16.846276", "step": 2611, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:09:17.020193", "step": 2611, "epoch": 3 }, { "type": "loss", "content": 0.41419529914855957, "timestamp": "2025-09-05 09:09:17.034287", "step": 2612, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:09:17.192268", "step": 2612, "epoch": 3 }, { "type": "loss", "content": 0.296019047498703, "timestamp": "2025-09-05 09:09:17.195977", "step": 2613, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:09:17.359277", "step": 2613, "epoch": 3 }, { "type": "loss", "content": 0.39310163259506226, "timestamp": "2025-09-05 09:09:17.361128", "step": 2614, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:09:17.519311", "step": 2614, "epoch": 3 }, { "type": "loss", "content": 0.2674983739852905, "timestamp": "2025-09-05 09:09:17.521442", "step": 2615, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:09:17.678767", "step": 2615, "epoch": 3 }, { "type": "loss", "content": 0.34225496649742126, "timestamp": "2025-09-05 09:09:17.692054", "step": 2616, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:09:17.843075", "step": 2616, "epoch": 3 }, { "type": "loss", "content": 0.36751917004585266, "timestamp": "2025-09-05 09:09:17.845103", "step": 2617, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:09:18.001589", "step": 2617, "epoch": 3 }, { "type": "loss", "content": 0.28645431995391846, "timestamp": "2025-09-05 09:09:18.003740", "step": 2618, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:09:18.163572", "step": 2618, "epoch": 3 }, { "type": "loss", "content": 0.27839794754981995, "timestamp": "2025-09-05 09:09:18.165439", "step": 2619, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:09:18.322425", "step": 2619, "epoch": 3 }, { "type": "loss", "content": 0.28111451864242554, "timestamp": "2025-09-05 09:09:18.336168", "step": 2620, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 09:09:22.963303", "step": 2620, "epoch": 3 }, { "type": "pplx", "content": 54.74175043223115, "timestamp": "2025-09-05 09:09:22.965443", "step": 2620, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:09:23.097523", "step": 2620, "epoch": 3 }, { "type": "loss", "content": 0.41243675351142883, "timestamp": "2025-09-05 09:09:23.099641", "step": 2621, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:09:23.236457", "step": 2621, "epoch": 3 }, { "type": "loss", "content": 0.4278426766395569, "timestamp": "2025-09-05 09:09:23.238298", "step": 2622, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:09:23.374088", "step": 2622, "epoch": 3 }, { "type": "loss", "content": 0.25342079997062683, "timestamp": "2025-09-05 09:09:23.376267", "step": 2623, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:09:23.513612", "step": 2623, "epoch": 3 }, { "type": "loss", "content": 0.4077974259853363, "timestamp": "2025-09-05 09:09:23.522957", "step": 2624, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:09:23.657452", "step": 2624, "epoch": 3 }, { "type": "loss", "content": 0.2912677526473999, "timestamp": "2025-09-05 09:09:23.659379", "step": 2625, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:09:23.794514", "step": 2625, "epoch": 3 }, { "type": "loss", "content": 0.28832995891571045, "timestamp": "2025-09-05 09:09:23.796204", "step": 2626, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:09:23.931743", "step": 2626, "epoch": 3 }, { "type": "loss", "content": 0.3055991530418396, "timestamp": "2025-09-05 09:09:23.934138", "step": 2627, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:09:24.103487", "step": 2627, "epoch": 3 }, { "type": "loss", "content": 0.3616314232349396, "timestamp": "2025-09-05 09:09:24.118124", "step": 2628, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:09:24.272435", "step": 2628, "epoch": 3 }, { "type": "loss", "content": 0.29930540919303894, "timestamp": "2025-09-05 09:09:24.274499", "step": 2629, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:09:24.432410", "step": 2629, "epoch": 3 }, { "type": "loss", "content": 0.27770620584487915, "timestamp": "2025-09-05 09:09:24.434428", "step": 2630, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:09:24.603178", "step": 2630, "epoch": 3 }, { "type": "loss", "content": 0.4240753948688507, "timestamp": "2025-09-05 09:09:24.604981", "step": 2631, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:09:24.739831", "step": 2631, "epoch": 3 }, { "type": "loss", "content": 0.2943328619003296, "timestamp": "2025-09-05 09:09:24.755715", "step": 2632, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:09:24.915341", "step": 2632, "epoch": 3 }, { "type": "loss", "content": 0.28002142906188965, "timestamp": "2025-09-05 09:09:24.917749", "step": 2633, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:09:25.074494", "step": 2633, "epoch": 3 }, { "type": "loss", "content": 0.3681841194629669, "timestamp": "2025-09-05 09:09:25.076991", "step": 2634, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:09:25.242111", "step": 2634, "epoch": 3 }, { "type": "loss", "content": 0.3582687973976135, "timestamp": "2025-09-05 09:09:25.244392", "step": 2635, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:09:25.403267", "step": 2635, "epoch": 3 }, { "type": "loss", "content": 0.38165122270584106, "timestamp": "2025-09-05 09:09:25.417274", "step": 2636, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:09:25.569052", "step": 2636, "epoch": 3 }, { "type": "loss", "content": 0.3990744352340698, "timestamp": "2025-09-05 09:09:25.571190", "step": 2637, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:09:25.733478", "step": 2637, "epoch": 3 }, { "type": "loss", "content": 0.33937230706214905, "timestamp": "2025-09-05 09:09:25.735736", "step": 2638, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:09:25.911457", "step": 2638, "epoch": 3 }, { "type": "loss", "content": 0.3162641227245331, "timestamp": "2025-09-05 09:09:25.913737", "step": 2639, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:09:26.072299", "step": 2639, "epoch": 3 }, { "type": "loss", "content": 0.2395915687084198, "timestamp": "2025-09-05 09:09:26.088449", "step": 2640, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 09:09:30.715372", "step": 2640, "epoch": 3 }, { "type": "pplx", "content": 54.28784188000765, "timestamp": "2025-09-05 09:09:30.717835", "step": 2640, "epoch": 3 }, { "type": "info", "content": "Checkpoint saved at step 2640", "timestamp": "2025-09-05 09:09:31.194816", "step": 2640, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:09:31.341771", "step": 2640, "epoch": 3 }, { "type": "loss", "content": 0.3118651211261749, "timestamp": "2025-09-05 09:09:31.344881", "step": 2641, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:09:31.503551", "step": 2641, "epoch": 3 }, { "type": "loss", "content": 0.33012741804122925, "timestamp": "2025-09-05 09:09:31.505717", "step": 2642, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:09:31.662794", "step": 2642, "epoch": 3 }, { "type": "loss", "content": 0.376676470041275, "timestamp": "2025-09-05 09:09:31.665311", "step": 2643, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:09:31.833512", "step": 2643, "epoch": 3 }, { "type": "loss", "content": 0.30343472957611084, "timestamp": "2025-09-05 09:09:31.847642", "step": 2644, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:09:32.002431", "step": 2644, "epoch": 3 }, { "type": "loss", "content": 0.3581269681453705, "timestamp": "2025-09-05 09:09:32.004501", "step": 2645, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:09:32.173097", "step": 2645, "epoch": 3 }, { "type": "loss", "content": 0.22369900345802307, "timestamp": "2025-09-05 09:09:32.174951", "step": 2646, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:09:32.309856", "step": 2646, "epoch": 3 }, { "type": "loss", "content": 0.3513410985469818, "timestamp": "2025-09-05 09:09:32.312072", "step": 2647, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:09:32.481357", "step": 2647, "epoch": 3 }, { "type": "loss", "content": 0.44361498951911926, "timestamp": "2025-09-05 09:09:32.495464", "step": 2648, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:09:32.649538", "step": 2648, "epoch": 3 }, { "type": "loss", "content": 0.2907888889312744, "timestamp": "2025-09-05 09:09:32.651742", "step": 2649, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:09:32.808844", "step": 2649, "epoch": 3 }, { "type": "loss", "content": 0.37796589732170105, "timestamp": "2025-09-05 09:09:32.810776", "step": 2650, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:09:32.968897", "step": 2650, "epoch": 3 }, { "type": "loss", "content": 0.29424649477005005, "timestamp": "2025-09-05 09:09:32.970994", "step": 2651, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:09:33.129396", "step": 2651, "epoch": 3 }, { "type": "loss", "content": 0.36393117904663086, "timestamp": "2025-09-05 09:09:33.145047", "step": 2652, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:09:33.310349", "step": 2652, "epoch": 3 }, { "type": "loss", "content": 0.3564506769180298, "timestamp": "2025-09-05 09:09:33.312321", "step": 2653, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:09:33.478108", "step": 2653, "epoch": 3 }, { "type": "loss", "content": 0.20433595776557922, "timestamp": "2025-09-05 09:09:33.479956", "step": 2654, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:09:33.638178", "step": 2654, "epoch": 3 }, { "type": "loss", "content": 0.38941457867622375, "timestamp": "2025-09-05 09:09:33.641092", "step": 2655, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:09:33.798976", "step": 2655, "epoch": 3 }, { "type": "loss", "content": 0.276498019695282, "timestamp": "2025-09-05 09:09:33.813114", "step": 2656, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:09:33.964685", "step": 2656, "epoch": 3 }, { "type": "loss", "content": 0.22760221362113953, "timestamp": "2025-09-05 09:09:33.966648", "step": 2657, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:09:34.133765", "step": 2657, "epoch": 3 }, { "type": "loss", "content": 0.39622488617897034, "timestamp": "2025-09-05 09:09:34.135682", "step": 2658, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:09:34.293759", "step": 2658, "epoch": 3 }, { "type": "loss", "content": 0.3628631830215454, "timestamp": "2025-09-05 09:09:34.295822", "step": 2659, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:09:34.453730", "step": 2659, "epoch": 3 }, { "type": "loss", "content": 0.28753095865249634, "timestamp": "2025-09-05 09:09:34.467476", "step": 2660, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 09:09:39.114526", "step": 2660, "epoch": 3 }, { "type": "pplx", "content": 54.31525715361568, "timestamp": "2025-09-05 09:09:39.116914", "step": 2660, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:09:39.253139", "step": 2660, "epoch": 3 }, { "type": "loss", "content": 0.25859639048576355, "timestamp": "2025-09-05 09:09:39.264102", "step": 2661, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:09:39.404262", "step": 2661, "epoch": 3 }, { "type": "loss", "content": 0.23019303381443024, "timestamp": "2025-09-05 09:09:39.407220", "step": 2662, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:09:39.581353", "step": 2662, "epoch": 3 }, { "type": "loss", "content": 0.5155594348907471, "timestamp": "2025-09-05 09:09:39.583174", "step": 2663, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:09:39.741092", "step": 2663, "epoch": 3 }, { "type": "loss", "content": 0.37276023626327515, "timestamp": "2025-09-05 09:09:39.755490", "step": 2664, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:09:39.907403", "step": 2664, "epoch": 3 }, { "type": "loss", "content": 0.41703230142593384, "timestamp": "2025-09-05 09:09:39.909321", "step": 2665, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:09:40.066875", "step": 2665, "epoch": 3 }, { "type": "loss", "content": 0.2926308512687683, "timestamp": "2025-09-05 09:09:40.068643", "step": 2666, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:09:40.226508", "step": 2666, "epoch": 3 }, { "type": "loss", "content": 0.4091567099094391, "timestamp": "2025-09-05 09:09:40.228193", "step": 2667, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:09:40.397210", "step": 2667, "epoch": 3 }, { "type": "loss", "content": 0.26959991455078125, "timestamp": "2025-09-05 09:09:40.414775", "step": 2668, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:09:40.572190", "step": 2668, "epoch": 3 }, { "type": "loss", "content": 0.2950553894042969, "timestamp": "2025-09-05 09:09:40.587593", "step": 2669, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:09:40.769007", "step": 2669, "epoch": 3 }, { "type": "loss", "content": 0.35082414746284485, "timestamp": "2025-09-05 09:09:40.771348", "step": 2670, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:09:40.933475", "step": 2670, "epoch": 3 }, { "type": "loss", "content": 0.2740470767021179, "timestamp": "2025-09-05 09:09:40.935596", "step": 2671, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:09:41.106478", "step": 2671, "epoch": 3 }, { "type": "loss", "content": 0.3159100115299225, "timestamp": "2025-09-05 09:09:41.115332", "step": 2672, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:09:41.249508", "step": 2672, "epoch": 3 }, { "type": "loss", "content": 0.4648512303829193, "timestamp": "2025-09-05 09:09:41.251877", "step": 2673, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 4480027263872.0 }, "timestamp": "2025-09-05 09:09:41.409328", "step": 2673, "epoch": 3 }, { "type": "loss", "content": 0.3527366518974304, "timestamp": "2025-09-05 09:09:41.411381", "step": 2674, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:09:41.584798", "step": 2674, "epoch": 3 }, { "type": "loss", "content": 0.26771530508995056, "timestamp": "2025-09-05 09:09:41.586705", "step": 2675, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:09:41.745088", "step": 2675, "epoch": 3 }, { "type": "loss", "content": 0.34135565161705017, "timestamp": "2025-09-05 09:09:41.761326", "step": 2676, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:09:41.914539", "step": 2676, "epoch": 3 }, { "type": "loss", "content": 0.35681870579719543, "timestamp": "2025-09-05 09:09:41.920362", "step": 2677, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:09:42.083593", "step": 2677, "epoch": 3 }, { "type": "loss", "content": 0.22552332282066345, "timestamp": "2025-09-05 09:09:42.085921", "step": 2678, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:09:42.255612", "step": 2678, "epoch": 3 }, { "type": "loss", "content": 0.24503037333488464, "timestamp": "2025-09-05 09:09:42.257451", "step": 2679, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:09:42.394045", "step": 2679, "epoch": 3 }, { "type": "loss", "content": 0.228922039270401, "timestamp": "2025-09-05 09:09:42.410435", "step": 2680, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 09:09:47.123710", "step": 2680, "epoch": 3 }, { "type": "pplx", "content": 54.31896988097937, "timestamp": "2025-09-05 09:09:47.126173", "step": 2680, "epoch": 3 }, { "type": "info", "content": "Checkpoint saved at step 2680", "timestamp": "2025-09-05 09:09:47.641795", "step": 2680, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:09:47.775868", "step": 2680, "epoch": 3 }, { "type": "loss", "content": 0.3755874037742615, "timestamp": "2025-09-05 09:09:47.778167", "step": 2681, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:09:47.915328", "step": 2681, "epoch": 3 }, { "type": "loss", "content": 0.30692851543426514, "timestamp": "2025-09-05 09:09:47.917502", "step": 2682, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:09:48.087264", "step": 2682, "epoch": 3 }, { "type": "loss", "content": 0.41206371784210205, "timestamp": "2025-09-05 09:09:48.089321", "step": 2683, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:09:48.249641", "step": 2683, "epoch": 3 }, { "type": "loss", "content": 0.38735464215278625, "timestamp": "2025-09-05 09:09:48.263554", "step": 2684, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:09:48.415430", "step": 2684, "epoch": 3 }, { "type": "loss", "content": 0.27017661929130554, "timestamp": "2025-09-05 09:09:48.417668", "step": 2685, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:09:48.553584", "step": 2685, "epoch": 3 }, { "type": "loss", "content": 0.3175680637359619, "timestamp": "2025-09-05 09:09:48.555889", "step": 2686, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:09:48.723796", "step": 2686, "epoch": 3 }, { "type": "loss", "content": 0.3680326044559479, "timestamp": "2025-09-05 09:09:48.725538", "step": 2687, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 4480027263872.0 }, "timestamp": "2025-09-05 09:09:48.885954", "step": 2687, "epoch": 3 }, { "type": "loss", "content": 0.29450035095214844, "timestamp": "2025-09-05 09:09:48.900149", "step": 2688, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:09:49.056183", "step": 2688, "epoch": 3 }, { "type": "loss", "content": 0.4046458303928375, "timestamp": "2025-09-05 09:09:49.058643", "step": 2689, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:09:49.216287", "step": 2689, "epoch": 3 }, { "type": "loss", "content": 0.3243006765842438, "timestamp": "2025-09-05 09:09:49.218640", "step": 2690, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:09:49.376779", "step": 2690, "epoch": 3 }, { "type": "loss", "content": 0.2823656499385834, "timestamp": "2025-09-05 09:09:49.379245", "step": 2691, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:09:49.557827", "step": 2691, "epoch": 3 }, { "type": "loss", "content": 0.3415341079235077, "timestamp": "2025-09-05 09:09:49.574030", "step": 2692, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:09:49.735573", "step": 2692, "epoch": 3 }, { "type": "loss", "content": 0.20649684965610504, "timestamp": "2025-09-05 09:09:49.737926", "step": 2693, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:09:49.897370", "step": 2693, "epoch": 3 }, { "type": "loss", "content": 0.41073697805404663, "timestamp": "2025-09-05 09:09:49.902347", "step": 2694, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 4480027263872.0 }, "timestamp": "2025-09-05 09:09:50.080020", "step": 2694, "epoch": 3 }, { "type": "loss", "content": 0.4066583514213562, "timestamp": "2025-09-05 09:09:50.083816", "step": 2695, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:09:50.257701", "step": 2695, "epoch": 3 }, { "type": "loss", "content": 0.43231305480003357, "timestamp": "2025-09-05 09:09:50.271674", "step": 2696, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:09:50.425120", "step": 2696, "epoch": 3 }, { "type": "loss", "content": 0.43370968103408813, "timestamp": "2025-09-05 09:09:50.427885", "step": 2697, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:09:50.587228", "step": 2697, "epoch": 3 }, { "type": "loss", "content": 0.3017624616622925, "timestamp": "2025-09-05 09:09:50.589376", "step": 2698, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:09:50.747465", "step": 2698, "epoch": 3 }, { "type": "loss", "content": 0.49601367115974426, "timestamp": "2025-09-05 09:09:50.749698", "step": 2699, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:09:50.919572", "step": 2699, "epoch": 3 }, { "type": "loss", "content": 0.3051532208919525, "timestamp": "2025-09-05 09:09:50.934651", "step": 2700, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 09:09:55.581191", "step": 2700, "epoch": 3 }, { "type": "pplx", "content": 54.71914685947849, "timestamp": "2025-09-05 09:09:55.583324", "step": 2700, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:09:55.715082", "step": 2700, "epoch": 3 }, { "type": "loss", "content": 0.24363669753074646, "timestamp": "2025-09-05 09:09:55.717265", "step": 2701, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-05 09:09:55.873487", "step": 2701, "epoch": 3 }, { "type": "loss", "content": 0.3039955198764801, "timestamp": "2025-09-05 09:09:55.875289", "step": 2702, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:09:56.042793", "step": 2702, "epoch": 3 }, { "type": "loss", "content": 0.3395662307739258, "timestamp": "2025-09-05 09:09:56.044668", "step": 2703, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:09:56.180369", "step": 2703, "epoch": 3 }, { "type": "loss", "content": 0.23626942932605743, "timestamp": "2025-09-05 09:09:56.196460", "step": 2704, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:09:56.356335", "step": 2704, "epoch": 3 }, { "type": "loss", "content": 0.3257940411567688, "timestamp": "2025-09-05 09:09:56.358374", "step": 2705, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:09:56.516397", "step": 2705, "epoch": 3 }, { "type": "loss", "content": 0.3324272930622101, "timestamp": "2025-09-05 09:09:56.519005", "step": 2706, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:09:56.658125", "step": 2706, "epoch": 3 }, { "type": "loss", "content": 0.18577173352241516, "timestamp": "2025-09-05 09:09:56.661453", "step": 2707, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:09:56.843395", "step": 2707, "epoch": 3 }, { "type": "loss", "content": 0.3146699070930481, "timestamp": "2025-09-05 09:09:56.860248", "step": 2708, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 4480027263872.0 }, "timestamp": "2025-09-05 09:09:57.033006", "step": 2708, "epoch": 3 }, { "type": "loss", "content": 0.23967741429805756, "timestamp": "2025-09-05 09:09:57.036268", "step": 2709, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:09:57.212345", "step": 2709, "epoch": 3 }, { "type": "loss", "content": 0.23511698842048645, "timestamp": "2025-09-05 09:09:57.214817", "step": 2710, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:09:57.392545", "step": 2710, "epoch": 3 }, { "type": "loss", "content": 0.28559327125549316, "timestamp": "2025-09-05 09:09:57.394953", "step": 2711, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:09:57.569750", "step": 2711, "epoch": 3 }, { "type": "loss", "content": 0.5006580352783203, "timestamp": "2025-09-05 09:09:57.584025", "step": 2712, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:09:57.742993", "step": 2712, "epoch": 3 }, { "type": "loss", "content": 0.2922325134277344, "timestamp": "2025-09-05 09:09:57.751655", "step": 2713, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:09:57.922784", "step": 2713, "epoch": 3 }, { "type": "loss", "content": 0.17340846359729767, "timestamp": "2025-09-05 09:09:57.930195", "step": 2714, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:09:58.110260", "step": 2714, "epoch": 3 }, { "type": "loss", "content": 0.22841380536556244, "timestamp": "2025-09-05 09:09:58.113975", "step": 2715, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:09:58.276525", "step": 2715, "epoch": 3 }, { "type": "loss", "content": 0.3006599247455597, "timestamp": "2025-09-05 09:09:58.293741", "step": 2716, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:09:58.459743", "step": 2716, "epoch": 3 }, { "type": "loss", "content": 0.3553715646266937, "timestamp": "2025-09-05 09:09:58.464739", "step": 2717, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:09:58.636391", "step": 2717, "epoch": 3 }, { "type": "loss", "content": 0.31938436627388, "timestamp": "2025-09-05 09:09:58.639450", "step": 2718, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:09:58.819436", "step": 2718, "epoch": 3 }, { "type": "loss", "content": 0.22373679280281067, "timestamp": "2025-09-05 09:09:58.822596", "step": 2719, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:09:59.011310", "step": 2719, "epoch": 3 }, { "type": "loss", "content": 0.20499032735824585, "timestamp": "2025-09-05 09:09:59.033254", "step": 2720, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 09:10:03.869184", "step": 2720, "epoch": 3 }, { "type": "pplx", "content": 55.399467890917855, "timestamp": "2025-09-05 09:10:03.871173", "step": 2720, "epoch": 3 }, { "type": "info", "content": "Checkpoint saved at step 2720", "timestamp": "2025-09-05 09:10:04.314045", "step": 2720, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:10:04.453237", "step": 2720, "epoch": 3 }, { "type": "loss", "content": 0.3693501055240631, "timestamp": "2025-09-05 09:10:04.455637", "step": 2721, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:10:04.623810", "step": 2721, "epoch": 3 }, { "type": "loss", "content": 0.2824878990650177, "timestamp": "2025-09-05 09:10:04.627198", "step": 2722, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:10:04.787875", "step": 2722, "epoch": 3 }, { "type": "loss", "content": 0.307536244392395, "timestamp": "2025-09-05 09:10:04.790194", "step": 2723, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:10:04.950797", "step": 2723, "epoch": 3 }, { "type": "loss", "content": 0.2077270895242691, "timestamp": "2025-09-05 09:10:04.959505", "step": 2724, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:10:05.097999", "step": 2724, "epoch": 3 }, { "type": "loss", "content": 0.2842904031276703, "timestamp": "2025-09-05 09:10:05.102783", "step": 2725, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:10:05.262814", "step": 2725, "epoch": 3 }, { "type": "loss", "content": 0.30767548084259033, "timestamp": "2025-09-05 09:10:05.266837", "step": 2726, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:10:05.426245", "step": 2726, "epoch": 3 }, { "type": "loss", "content": 0.41483354568481445, "timestamp": "2025-09-05 09:10:05.429925", "step": 2727, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:10:05.592194", "step": 2727, "epoch": 3 }, { "type": "loss", "content": 0.2677007019519806, "timestamp": "2025-09-05 09:10:05.606152", "step": 2728, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:10:05.761691", "step": 2728, "epoch": 3 }, { "type": "loss", "content": 0.3418111801147461, "timestamp": "2025-09-05 09:10:05.763999", "step": 2729, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:10:05.901368", "step": 2729, "epoch": 3 }, { "type": "loss", "content": 0.26328641176223755, "timestamp": "2025-09-05 09:10:05.905885", "step": 2730, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:10:06.077778", "step": 2730, "epoch": 3 }, { "type": "loss", "content": 0.287534236907959, "timestamp": "2025-09-05 09:10:06.079870", "step": 2731, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:10:06.218060", "step": 2731, "epoch": 3 }, { "type": "loss", "content": 0.23290354013442993, "timestamp": "2025-09-05 09:10:06.232682", "step": 2732, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 4480027263872.0 }, "timestamp": "2025-09-05 09:10:06.387980", "step": 2732, "epoch": 3 }, { "type": "loss", "content": 0.44484513998031616, "timestamp": "2025-09-05 09:10:06.390645", "step": 2733, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:10:06.560661", "step": 2733, "epoch": 3 }, { "type": "loss", "content": 0.3004152774810791, "timestamp": "2025-09-05 09:10:06.565345", "step": 2734, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:10:06.728713", "step": 2734, "epoch": 3 }, { "type": "loss", "content": 0.23439401388168335, "timestamp": "2025-09-05 09:10:06.731721", "step": 2735, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:10:06.906635", "step": 2735, "epoch": 3 }, { "type": "loss", "content": 0.39753445982933044, "timestamp": "2025-09-05 09:10:06.921311", "step": 2736, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:10:07.081558", "step": 2736, "epoch": 3 }, { "type": "loss", "content": 0.26311197876930237, "timestamp": "2025-09-05 09:10:07.084292", "step": 2737, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:10:07.246437", "step": 2737, "epoch": 3 }, { "type": "loss", "content": 0.2834315896034241, "timestamp": "2025-09-05 09:10:07.249366", "step": 2738, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:10:07.421327", "step": 2738, "epoch": 3 }, { "type": "loss", "content": 0.35053861141204834, "timestamp": "2025-09-05 09:10:07.424016", "step": 2739, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:10:07.601132", "step": 2739, "epoch": 3 }, { "type": "loss", "content": 0.2513752579689026, "timestamp": "2025-09-05 09:10:07.618283", "step": 2740, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 09:10:12.409186", "step": 2740, "epoch": 3 }, { "type": "pplx", "content": 55.606292247536096, "timestamp": "2025-09-05 09:10:12.416713", "step": 2740, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:10:12.551578", "step": 2740, "epoch": 3 }, { "type": "loss", "content": 0.2897755801677704, "timestamp": "2025-09-05 09:10:12.558344", "step": 2741, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:10:12.698223", "step": 2741, "epoch": 3 }, { "type": "loss", "content": 0.3819401264190674, "timestamp": "2025-09-05 09:10:12.701247", "step": 2742, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:10:12.841144", "step": 2742, "epoch": 3 }, { "type": "loss", "content": 0.3809162378311157, "timestamp": "2025-09-05 09:10:12.847553", "step": 2743, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 4480027263872.0 }, "timestamp": "2025-09-05 09:10:12.986801", "step": 2743, "epoch": 3 }, { "type": "loss", "content": 0.27368873357772827, "timestamp": "2025-09-05 09:10:12.996377", "step": 2744, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:10:13.135351", "step": 2744, "epoch": 3 }, { "type": "loss", "content": 0.33143290877342224, "timestamp": "2025-09-05 09:10:13.144607", "step": 2745, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:10:13.288483", "step": 2745, "epoch": 3 }, { "type": "loss", "content": 0.23653104901313782, "timestamp": "2025-09-05 09:10:13.293088", "step": 2746, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:10:13.468711", "step": 2746, "epoch": 3 }, { "type": "loss", "content": 0.37218546867370605, "timestamp": "2025-09-05 09:10:13.471453", "step": 2747, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:10:13.645855", "step": 2747, "epoch": 3 }, { "type": "loss", "content": 0.27283674478530884, "timestamp": "2025-09-05 09:10:13.662936", "step": 2748, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:10:13.822337", "step": 2748, "epoch": 3 }, { "type": "loss", "content": 0.4320858120918274, "timestamp": "2025-09-05 09:10:13.825185", "step": 2749, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:10:14.000505", "step": 2749, "epoch": 3 }, { "type": "loss", "content": 0.2452457696199417, "timestamp": "2025-09-05 09:10:14.014447", "step": 2750, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:10:14.182660", "step": 2750, "epoch": 3 }, { "type": "loss", "content": 0.1904885619878769, "timestamp": "2025-09-05 09:10:14.185589", "step": 2751, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:10:14.350431", "step": 2751, "epoch": 3 }, { "type": "loss", "content": 0.3453499376773834, "timestamp": "2025-09-05 09:10:14.365541", "step": 2752, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:10:14.524726", "step": 2752, "epoch": 3 }, { "type": "loss", "content": 0.267494261264801, "timestamp": "2025-09-05 09:10:14.528420", "step": 2753, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:10:14.716589", "step": 2753, "epoch": 3 }, { "type": "loss", "content": 0.43331682682037354, "timestamp": "2025-09-05 09:10:14.720911", "step": 2754, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:10:14.901813", "step": 2754, "epoch": 3 }, { "type": "loss", "content": 0.3932355046272278, "timestamp": "2025-09-05 09:10:14.905171", "step": 2755, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:10:15.085460", "step": 2755, "epoch": 3 }, { "type": "loss", "content": 0.3386424779891968, "timestamp": "2025-09-05 09:10:15.102279", "step": 2756, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:10:15.279114", "step": 2756, "epoch": 3 }, { "type": "loss", "content": 0.3107830584049225, "timestamp": "2025-09-05 09:10:15.282876", "step": 2757, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:10:15.461908", "step": 2757, "epoch": 3 }, { "type": "loss", "content": 0.2651340961456299, "timestamp": "2025-09-05 09:10:15.465681", "step": 2758, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:10:15.630422", "step": 2758, "epoch": 3 }, { "type": "loss", "content": 0.4322452247142792, "timestamp": "2025-09-05 09:10:15.639175", "step": 2759, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:10:15.802596", "step": 2759, "epoch": 3 }, { "type": "loss", "content": 0.3348475694656372, "timestamp": "2025-09-05 09:10:15.819849", "step": 2760, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 09:10:20.767860", "step": 2760, "epoch": 3 }, { "type": "pplx", "content": 55.56754759928199, "timestamp": "2025-09-05 09:10:20.772035", "step": 2760, "epoch": 3 }, { "type": "info", "content": "Checkpoint saved at step 2760", "timestamp": "2025-09-05 09:10:21.365515", "step": 2760, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:10:21.595230", "step": 2760, "epoch": 3 }, { "type": "loss", "content": 0.23760850727558136, "timestamp": "2025-09-05 09:10:21.602522", "step": 2761, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:10:21.765693", "step": 2761, "epoch": 3 }, { "type": "loss", "content": 0.3854387402534485, "timestamp": "2025-09-05 09:10:21.776647", "step": 2762, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 4480027263872.0 }, "timestamp": "2025-09-05 09:10:21.940738", "step": 2762, "epoch": 3 }, { "type": "loss", "content": 0.3008959889411926, "timestamp": "2025-09-05 09:10:21.944579", "step": 2763, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:10:22.108064", "step": 2763, "epoch": 3 }, { "type": "loss", "content": 0.2746279537677765, "timestamp": "2025-09-05 09:10:22.124956", "step": 2764, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:10:22.294008", "step": 2764, "epoch": 3 }, { "type": "loss", "content": 0.28841450810432434, "timestamp": "2025-09-05 09:10:22.296980", "step": 2765, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:10:22.472863", "step": 2765, "epoch": 3 }, { "type": "loss", "content": 0.20862191915512085, "timestamp": "2025-09-05 09:10:22.475833", "step": 2766, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:10:22.639204", "step": 2766, "epoch": 3 }, { "type": "loss", "content": 0.2936626076698303, "timestamp": "2025-09-05 09:10:22.648560", "step": 2767, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:10:22.821191", "step": 2767, "epoch": 3 }, { "type": "loss", "content": 0.3055102825164795, "timestamp": "2025-09-05 09:10:22.846361", "step": 2768, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:10:23.025985", "step": 2768, "epoch": 3 }, { "type": "loss", "content": 0.42737877368927, "timestamp": "2025-09-05 09:10:23.037055", "step": 2769, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:10:23.210901", "step": 2769, "epoch": 3 }, { "type": "loss", "content": 0.28168973326683044, "timestamp": "2025-09-05 09:10:23.214976", "step": 2770, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:10:23.404027", "step": 2770, "epoch": 3 }, { "type": "loss", "content": 0.4235506057739258, "timestamp": "2025-09-05 09:10:23.407613", "step": 2771, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:10:23.588220", "step": 2771, "epoch": 3 }, { "type": "loss", "content": 0.28592631220817566, "timestamp": "2025-09-05 09:10:23.605025", "step": 2772, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:10:23.773596", "step": 2772, "epoch": 3 }, { "type": "loss", "content": 0.2634555995464325, "timestamp": "2025-09-05 09:10:23.776889", "step": 2773, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:10:23.945473", "step": 2773, "epoch": 3 }, { "type": "loss", "content": 0.21984124183654785, "timestamp": "2025-09-05 09:10:23.947745", "step": 2774, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:10:24.086416", "step": 2774, "epoch": 3 }, { "type": "loss", "content": 0.20653694868087769, "timestamp": "2025-09-05 09:10:24.090554", "step": 2775, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:10:24.259486", "step": 2775, "epoch": 3 }, { "type": "loss", "content": 0.2279079109430313, "timestamp": "2025-09-05 09:10:24.275952", "step": 2776, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:10:24.442371", "step": 2776, "epoch": 3 }, { "type": "loss", "content": 0.38293153047561646, "timestamp": "2025-09-05 09:10:24.447664", "step": 2777, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:10:24.631220", "step": 2777, "epoch": 3 }, { "type": "loss", "content": 0.2632295787334442, "timestamp": "2025-09-05 09:10:24.634597", "step": 2778, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:10:24.809574", "step": 2778, "epoch": 3 }, { "type": "loss", "content": 0.3497149348258972, "timestamp": "2025-09-05 09:10:24.812800", "step": 2779, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:10:24.977713", "step": 2779, "epoch": 3 }, { "type": "loss", "content": 0.3435024619102478, "timestamp": "2025-09-05 09:10:24.995774", "step": 2780, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 09:10:29.786411", "step": 2780, "epoch": 3 }, { "type": "pplx", "content": 55.502120442422544, "timestamp": "2025-09-05 09:10:29.790346", "step": 2780, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-05 09:10:29.922857", "step": 2780, "epoch": 3 }, { "type": "loss", "content": 0.2520749866962433, "timestamp": "2025-09-05 09:10:29.932096", "step": 2781, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:10:30.071586", "step": 2781, "epoch": 3 }, { "type": "loss", "content": 0.2531195282936096, "timestamp": "2025-09-05 09:10:30.074133", "step": 2782, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:10:30.213336", "step": 2782, "epoch": 3 }, { "type": "loss", "content": 0.41319963335990906, "timestamp": "2025-09-05 09:10:30.218550", "step": 2783, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:10:30.360495", "step": 2783, "epoch": 3 }, { "type": "loss", "content": 0.25934308767318726, "timestamp": "2025-09-05 09:10:30.376438", "step": 2784, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:10:30.514182", "step": 2784, "epoch": 3 }, { "type": "loss", "content": 0.16343851387500763, "timestamp": "2025-09-05 09:10:30.517315", "step": 2785, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:10:30.657243", "step": 2785, "epoch": 3 }, { "type": "loss", "content": 0.34972044825553894, "timestamp": "2025-09-05 09:10:30.660463", "step": 2786, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:10:30.800154", "step": 2786, "epoch": 3 }, { "type": "loss", "content": 0.3237220346927643, "timestamp": "2025-09-05 09:10:30.806393", "step": 2787, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:10:30.948423", "step": 2787, "epoch": 3 }, { "type": "loss", "content": 0.2645550072193146, "timestamp": "2025-09-05 09:10:30.959681", "step": 2788, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:10:31.095759", "step": 2788, "epoch": 3 }, { "type": "loss", "content": 0.3123627305030823, "timestamp": "2025-09-05 09:10:31.099566", "step": 2789, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:10:31.249554", "step": 2789, "epoch": 3 }, { "type": "loss", "content": 0.2884657680988312, "timestamp": "2025-09-05 09:10:31.252646", "step": 2790, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:10:31.431942", "step": 2790, "epoch": 3 }, { "type": "loss", "content": 0.22808344662189484, "timestamp": "2025-09-05 09:10:31.434481", "step": 2791, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:10:31.608196", "step": 2791, "epoch": 3 }, { "type": "loss", "content": 0.4153204560279846, "timestamp": "2025-09-05 09:10:31.627058", "step": 2792, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:10:31.788114", "step": 2792, "epoch": 3 }, { "type": "loss", "content": 0.24033991992473602, "timestamp": "2025-09-05 09:10:31.790509", "step": 2793, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:10:31.966073", "step": 2793, "epoch": 3 }, { "type": "loss", "content": 0.2547632157802582, "timestamp": "2025-09-05 09:10:31.970354", "step": 2794, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:10:32.145056", "step": 2794, "epoch": 3 }, { "type": "loss", "content": 0.3763968050479889, "timestamp": "2025-09-05 09:10:32.147688", "step": 2795, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:10:32.310468", "step": 2795, "epoch": 3 }, { "type": "loss", "content": 0.4241531491279602, "timestamp": "2025-09-05 09:10:32.324920", "step": 2796, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:10:32.478476", "step": 2796, "epoch": 3 }, { "type": "loss", "content": 0.2867893874645233, "timestamp": "2025-09-05 09:10:32.481606", "step": 2797, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:10:32.645290", "step": 2797, "epoch": 3 }, { "type": "loss", "content": 0.2623746991157532, "timestamp": "2025-09-05 09:10:32.650158", "step": 2798, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:10:32.829402", "step": 2798, "epoch": 3 }, { "type": "loss", "content": 0.353633314371109, "timestamp": "2025-09-05 09:10:32.832494", "step": 2799, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:10:33.007499", "step": 2799, "epoch": 3 }, { "type": "loss", "content": 0.3284893035888672, "timestamp": "2025-09-05 09:10:33.023668", "step": 2800, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 09:10:37.838708", "step": 2800, "epoch": 3 }, { "type": "pplx", "content": 56.313607306551525, "timestamp": "2025-09-05 09:10:37.840865", "step": 2800, "epoch": 3 }, { "type": "info", "content": "Checkpoint saved at step 2800", "timestamp": "2025-09-05 09:10:38.386248", "step": 2800, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:10:38.566979", "step": 2800, "epoch": 3 }, { "type": "loss", "content": 0.2466062307357788, "timestamp": "2025-09-05 09:10:38.574310", "step": 2801, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:10:38.797138", "step": 2801, "epoch": 3 }, { "type": "loss", "content": 0.297829270362854, "timestamp": "2025-09-05 09:10:38.801752", "step": 2802, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:10:39.027938", "step": 2802, "epoch": 3 }, { "type": "loss", "content": 0.3962607681751251, "timestamp": "2025-09-05 09:10:39.037773", "step": 2803, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:10:39.256098", "step": 2803, "epoch": 3 }, { "type": "loss", "content": 0.23827433586120605, "timestamp": "2025-09-05 09:10:39.273813", "step": 2804, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:10:39.486250", "step": 2804, "epoch": 3 }, { "type": "loss", "content": 0.34505435824394226, "timestamp": "2025-09-05 09:10:39.498012", "step": 2805, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:10:39.701452", "step": 2805, "epoch": 3 }, { "type": "loss", "content": 0.17793166637420654, "timestamp": "2025-09-05 09:10:39.706284", "step": 2806, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:10:39.936500", "step": 2806, "epoch": 3 }, { "type": "loss", "content": 0.387498140335083, "timestamp": "2025-09-05 09:10:39.943246", "step": 2807, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:10:40.141959", "step": 2807, "epoch": 3 }, { "type": "loss", "content": 0.25481945276260376, "timestamp": "2025-09-05 09:10:40.161190", "step": 2808, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:10:40.369536", "step": 2808, "epoch": 3 }, { "type": "loss", "content": 0.22373159229755402, "timestamp": "2025-09-05 09:10:40.373589", "step": 2809, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:10:40.584720", "step": 2809, "epoch": 3 }, { "type": "loss", "content": 0.2652718126773834, "timestamp": "2025-09-05 09:10:40.592769", "step": 2810, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:10:40.811016", "step": 2810, "epoch": 3 }, { "type": "loss", "content": 0.24120259284973145, "timestamp": "2025-09-05 09:10:40.821878", "step": 2811, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:10:40.994024", "step": 2811, "epoch": 3 }, { "type": "loss", "content": 0.29725131392478943, "timestamp": "2025-09-05 09:10:41.010223", "step": 2812, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:10:41.187657", "step": 2812, "epoch": 3 }, { "type": "loss", "content": 0.24363994598388672, "timestamp": "2025-09-05 09:10:41.193535", "step": 2813, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:10:41.369512", "step": 2813, "epoch": 3 }, { "type": "loss", "content": 0.29551854729652405, "timestamp": "2025-09-05 09:10:41.374421", "step": 2814, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:10:41.645614", "step": 2814, "epoch": 3 }, { "type": "loss", "content": 0.3614238202571869, "timestamp": "2025-09-05 09:10:41.649105", "step": 2815, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:10:41.827736", "step": 2815, "epoch": 3 }, { "type": "loss", "content": 0.4348675310611725, "timestamp": "2025-09-05 09:10:41.847192", "step": 2816, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:10:42.024160", "step": 2816, "epoch": 3 }, { "type": "loss", "content": 0.26274287700653076, "timestamp": "2025-09-05 09:10:42.026226", "step": 2817, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:10:42.201657", "step": 2817, "epoch": 3 }, { "type": "loss", "content": 0.28489479422569275, "timestamp": "2025-09-05 09:10:42.204164", "step": 2818, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:10:42.382200", "step": 2818, "epoch": 3 }, { "type": "loss", "content": 0.3202391564846039, "timestamp": "2025-09-05 09:10:42.385910", "step": 2819, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:10:42.566485", "step": 2819, "epoch": 3 }, { "type": "loss", "content": 0.31756865978240967, "timestamp": "2025-09-05 09:10:42.583875", "step": 2820, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 09:10:48.047056", "step": 2820, "epoch": 3 }, { "type": "pplx", "content": 57.04747695124036, "timestamp": "2025-09-05 09:10:48.051916", "step": 2820, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:10:48.232338", "step": 2820, "epoch": 3 }, { "type": "loss", "content": 0.2091667205095291, "timestamp": "2025-09-05 09:10:48.235792", "step": 2821, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:10:48.446456", "step": 2821, "epoch": 3 }, { "type": "loss", "content": 0.39467981457710266, "timestamp": "2025-09-05 09:10:48.449068", "step": 2822, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:10:48.647493", "step": 2822, "epoch": 3 }, { "type": "loss", "content": 0.2453453540802002, "timestamp": "2025-09-05 09:10:48.651805", "step": 2823, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:10:48.851775", "step": 2823, "epoch": 3 }, { "type": "loss", "content": 0.3213421404361725, "timestamp": "2025-09-05 09:10:48.870243", "step": 2824, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:10:49.067797", "step": 2824, "epoch": 3 }, { "type": "loss", "content": 0.360073447227478, "timestamp": "2025-09-05 09:10:49.071550", "step": 2825, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:10:49.288504", "step": 2825, "epoch": 3 }, { "type": "loss", "content": 0.3350084125995636, "timestamp": "2025-09-05 09:10:49.293974", "step": 2826, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:10:49.517532", "step": 2826, "epoch": 3 }, { "type": "loss", "content": 0.2850082218647003, "timestamp": "2025-09-05 09:10:49.521509", "step": 2827, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:10:49.734675", "step": 2827, "epoch": 3 }, { "type": "loss", "content": 0.24017834663391113, "timestamp": "2025-09-05 09:10:49.750677", "step": 2828, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:10:49.956360", "step": 2828, "epoch": 3 }, { "type": "loss", "content": 0.38482263684272766, "timestamp": "2025-09-05 09:10:49.958550", "step": 2829, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:10:50.165235", "step": 2829, "epoch": 3 }, { "type": "loss", "content": 0.382106751203537, "timestamp": "2025-09-05 09:10:50.169821", "step": 2830, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:10:50.376138", "step": 2830, "epoch": 3 }, { "type": "loss", "content": 0.2114604413509369, "timestamp": "2025-09-05 09:10:50.378830", "step": 2831, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:10:50.589159", "step": 2831, "epoch": 3 }, { "type": "loss", "content": 0.2949742078781128, "timestamp": "2025-09-05 09:10:50.605645", "step": 2832, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:10:50.844757", "step": 2832, "epoch": 3 }, { "type": "loss", "content": 0.19070105254650116, "timestamp": "2025-09-05 09:10:50.847226", "step": 2833, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:10:51.046271", "step": 2833, "epoch": 3 }, { "type": "loss", "content": 0.23265816271305084, "timestamp": "2025-09-05 09:10:51.049494", "step": 2834, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:10:51.214113", "step": 2834, "epoch": 3 }, { "type": "loss", "content": 0.3064287602901459, "timestamp": "2025-09-05 09:10:51.221071", "step": 2835, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:10:51.431369", "step": 2835, "epoch": 3 }, { "type": "loss", "content": 0.33445465564727783, "timestamp": "2025-09-05 09:10:51.446234", "step": 2836, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:10:51.644763", "step": 2836, "epoch": 3 }, { "type": "loss", "content": 0.2877272665500641, "timestamp": "2025-09-05 09:10:51.646928", "step": 2837, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:10:51.846453", "step": 2837, "epoch": 3 }, { "type": "loss", "content": 0.2460276037454605, "timestamp": "2025-09-05 09:10:51.851098", "step": 2838, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:10:52.053185", "step": 2838, "epoch": 3 }, { "type": "loss", "content": 0.47292837500572205, "timestamp": "2025-09-05 09:10:52.056120", "step": 2839, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:10:52.253361", "step": 2839, "epoch": 3 }, { "type": "loss", "content": 0.23295848071575165, "timestamp": "2025-09-05 09:10:52.272148", "step": 2840, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 09:10:57.469162", "step": 2840, "epoch": 3 }, { "type": "pplx", "content": 56.21594373110914, "timestamp": "2025-09-05 09:10:57.473155", "step": 2840, "epoch": 3 }, { "type": "info", "content": "Checkpoint saved at step 2840", "timestamp": "2025-09-05 09:10:57.984109", "step": 2840, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:10:58.122877", "step": 2840, "epoch": 3 }, { "type": "loss", "content": 0.31002530455589294, "timestamp": "2025-09-05 09:10:58.127195", "step": 2841, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:10:58.305422", "step": 2841, "epoch": 3 }, { "type": "loss", "content": 0.28681862354278564, "timestamp": "2025-09-05 09:10:58.350144", "step": 2842, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:10:58.535226", "step": 2842, "epoch": 3 }, { "type": "loss", "content": 0.44048944115638733, "timestamp": "2025-09-05 09:10:58.576570", "step": 2843, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:10:58.866502", "step": 2843, "epoch": 3 }, { "type": "loss", "content": 0.4286167025566101, "timestamp": "2025-09-05 09:10:58.881406", "step": 2844, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:10:59.144763", "step": 2844, "epoch": 3 }, { "type": "loss", "content": 0.28463494777679443, "timestamp": "2025-09-05 09:10:59.149799", "step": 2845, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:10:59.319538", "step": 2845, "epoch": 3 }, { "type": "loss", "content": 0.2371792048215866, "timestamp": "2025-09-05 09:10:59.321865", "step": 2846, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:10:59.494662", "step": 2846, "epoch": 3 }, { "type": "loss", "content": 0.27211883664131165, "timestamp": "2025-09-05 09:10:59.525047", "step": 2847, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:10:59.758416", "step": 2847, "epoch": 3 }, { "type": "loss", "content": 0.2545848488807678, "timestamp": "2025-09-05 09:10:59.776991", "step": 2848, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:10:59.931962", "step": 2848, "epoch": 3 }, { "type": "loss", "content": 0.24429751932621002, "timestamp": "2025-09-05 09:10:59.935000", "step": 2849, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:11:00.096442", "step": 2849, "epoch": 3 }, { "type": "loss", "content": 0.20609146356582642, "timestamp": "2025-09-05 09:11:00.099578", "step": 2850, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:11:00.262502", "step": 2850, "epoch": 3 }, { "type": "loss", "content": 0.274873286485672, "timestamp": "2025-09-05 09:11:00.306226", "step": 2851, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:11:00.478179", "step": 2851, "epoch": 3 }, { "type": "loss", "content": 0.24983830749988556, "timestamp": "2025-09-05 09:11:00.501042", "step": 2852, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:11:00.666751", "step": 2852, "epoch": 3 }, { "type": "loss", "content": 0.2654348611831665, "timestamp": "2025-09-05 09:11:00.670801", "step": 2853, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:11:00.854654", "step": 2853, "epoch": 3 }, { "type": "loss", "content": 0.2965334951877594, "timestamp": "2025-09-05 09:11:00.857254", "step": 2854, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:11:01.111321", "step": 2854, "epoch": 3 }, { "type": "loss", "content": 0.5099512338638306, "timestamp": "2025-09-05 09:11:01.114036", "step": 2855, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 4480027263872.0 }, "timestamp": "2025-09-05 09:11:01.289857", "step": 2855, "epoch": 3 }, { "type": "loss", "content": 0.27999117970466614, "timestamp": "2025-09-05 09:11:01.305114", "step": 2856, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:11:01.460920", "step": 2856, "epoch": 3 }, { "type": "loss", "content": 0.4357217848300934, "timestamp": "2025-09-05 09:11:01.463101", "step": 2857, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:11:01.637596", "step": 2857, "epoch": 3 }, { "type": "loss", "content": 0.304353266954422, "timestamp": "2025-09-05 09:11:01.640459", "step": 2858, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:11:01.867325", "step": 2858, "epoch": 3 }, { "type": "loss", "content": 0.3254320025444031, "timestamp": "2025-09-05 09:11:01.870492", "step": 2859, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:11:02.041130", "step": 2859, "epoch": 3 }, { "type": "loss", "content": 0.42343053221702576, "timestamp": "2025-09-05 09:11:02.056588", "step": 2860, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 09:11:07.149407", "step": 2860, "epoch": 3 }, { "type": "pplx", "content": 56.447537887231604, "timestamp": "2025-09-05 09:11:07.153770", "step": 2860, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:11:07.324510", "step": 2860, "epoch": 3 }, { "type": "loss", "content": 0.2908163368701935, "timestamp": "2025-09-05 09:11:07.329103", "step": 2861, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:11:07.535867", "step": 2861, "epoch": 3 }, { "type": "loss", "content": 0.272657185792923, "timestamp": "2025-09-05 09:11:07.538006", "step": 2862, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:11:07.788214", "step": 2862, "epoch": 3 }, { "type": "loss", "content": 0.30033841729164124, "timestamp": "2025-09-05 09:11:07.790608", "step": 2863, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:11:07.999472", "step": 2863, "epoch": 3 }, { "type": "loss", "content": 0.3589075207710266, "timestamp": "2025-09-05 09:11:08.008266", "step": 2864, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:11:08.174062", "step": 2864, "epoch": 3 }, { "type": "loss", "content": 0.2896050810813904, "timestamp": "2025-09-05 09:11:08.177026", "step": 2865, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:11:08.385757", "step": 2865, "epoch": 3 }, { "type": "loss", "content": 0.303393691778183, "timestamp": "2025-09-05 09:11:08.390120", "step": 2866, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:11:08.567458", "step": 2866, "epoch": 3 }, { "type": "loss", "content": 0.2631215453147888, "timestamp": "2025-09-05 09:11:08.570872", "step": 2867, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:11:08.732555", "step": 2867, "epoch": 3 }, { "type": "loss", "content": 0.17335376143455505, "timestamp": "2025-09-05 09:11:08.749457", "step": 2868, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:11:08.917535", "step": 2868, "epoch": 3 }, { "type": "loss", "content": 0.3204255998134613, "timestamp": "2025-09-05 09:11:08.920552", "step": 2869, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:11:09.083644", "step": 2869, "epoch": 3 }, { "type": "loss", "content": 0.2697032392024994, "timestamp": "2025-09-05 09:11:09.117302", "step": 2870, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:11:09.304974", "step": 2870, "epoch": 3 }, { "type": "loss", "content": 0.2992748022079468, "timestamp": "2025-09-05 09:11:09.307880", "step": 2871, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:11:09.489092", "step": 2871, "epoch": 3 }, { "type": "loss", "content": 0.4421190619468689, "timestamp": "2025-09-05 09:11:09.503449", "step": 2872, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:11:09.655522", "step": 2872, "epoch": 3 }, { "type": "loss", "content": 0.3196788728237152, "timestamp": "2025-09-05 09:11:09.657440", "step": 2873, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:11:09.845994", "step": 2873, "epoch": 3 }, { "type": "loss", "content": 0.35318523645401, "timestamp": "2025-09-05 09:11:09.848550", "step": 2874, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:11:10.074877", "step": 2874, "epoch": 3 }, { "type": "loss", "content": 0.2514253556728363, "timestamp": "2025-09-05 09:11:10.078009", "step": 2875, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:11:10.251551", "step": 2875, "epoch": 3 }, { "type": "loss", "content": 0.26804113388061523, "timestamp": "2025-09-05 09:11:10.265438", "step": 2876, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:11:10.417155", "step": 2876, "epoch": 3 }, { "type": "loss", "content": 0.4025658071041107, "timestamp": "2025-09-05 09:11:10.419938", "step": 2877, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:11:10.590848", "step": 2877, "epoch": 3 }, { "type": "loss", "content": 0.30375415086746216, "timestamp": "2025-09-05 09:11:10.593530", "step": 2878, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:11:10.848347", "step": 2878, "epoch": 3 }, { "type": "loss", "content": 0.20703478157520294, "timestamp": "2025-09-05 09:11:10.850887", "step": 2879, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:11:11.077416", "step": 2879, "epoch": 3 }, { "type": "loss", "content": 0.2653372287750244, "timestamp": "2025-09-05 09:11:11.094236", "step": 2880, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 09:11:16.273351", "step": 2880, "epoch": 3 }, { "type": "pplx", "content": 56.14329639107681, "timestamp": "2025-09-05 09:11:16.275174", "step": 2880, "epoch": 3 }, { "type": "info", "content": "Checkpoint saved at step 2880", "timestamp": "2025-09-05 09:11:16.727654", "step": 2880, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:11:16.902534", "step": 2880, "epoch": 3 }, { "type": "loss", "content": 0.18782301247119904, "timestamp": "2025-09-05 09:11:16.904887", "step": 2881, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:11:17.108973", "step": 2881, "epoch": 3 }, { "type": "loss", "content": 0.46404707431793213, "timestamp": "2025-09-05 09:11:17.111598", "step": 2882, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:11:17.330037", "step": 2882, "epoch": 3 }, { "type": "loss", "content": 0.23860648274421692, "timestamp": "2025-09-05 09:11:17.332733", "step": 2883, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:11:17.579843", "step": 2883, "epoch": 3 }, { "type": "loss", "content": 0.3546093702316284, "timestamp": "2025-09-05 09:11:17.594163", "step": 2884, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:11:17.783185", "step": 2884, "epoch": 3 }, { "type": "loss", "content": 0.2759442627429962, "timestamp": "2025-09-05 09:11:17.785356", "step": 2885, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 4480027263872.0 }, "timestamp": "2025-09-05 09:11:17.981116", "step": 2885, "epoch": 3 }, { "type": "loss", "content": 0.28804221749305725, "timestamp": "2025-09-05 09:11:17.983364", "step": 2886, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:11:18.178205", "step": 2886, "epoch": 3 }, { "type": "loss", "content": 0.2742845416069031, "timestamp": "2025-09-05 09:11:18.180403", "step": 2887, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:11:18.385459", "step": 2887, "epoch": 3 }, { "type": "loss", "content": 0.2764730155467987, "timestamp": "2025-09-05 09:11:18.394721", "step": 2888, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:11:18.557787", "step": 2888, "epoch": 3 }, { "type": "loss", "content": 0.29613184928894043, "timestamp": "2025-09-05 09:11:18.559668", "step": 2889, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:11:18.763335", "step": 2889, "epoch": 3 }, { "type": "loss", "content": 0.3713921010494232, "timestamp": "2025-09-05 09:11:18.765453", "step": 2890, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:11:18.969399", "step": 2890, "epoch": 3 }, { "type": "loss", "content": 0.3053815960884094, "timestamp": "2025-09-05 09:11:18.971277", "step": 2891, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:11:19.176772", "step": 2891, "epoch": 3 }, { "type": "loss", "content": 0.2894931137561798, "timestamp": "2025-09-05 09:11:19.190110", "step": 2892, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 4480027263872.0 }, "timestamp": "2025-09-05 09:11:19.381008", "step": 2892, "epoch": 3 }, { "type": "loss", "content": 0.449165016412735, "timestamp": "2025-09-05 09:11:19.383226", "step": 2893, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:11:19.675218", "step": 2893, "epoch": 3 }, { "type": "loss", "content": 0.28056976199150085, "timestamp": "2025-09-05 09:11:19.677221", "step": 2894, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:11:19.884855", "step": 2894, "epoch": 3 }, { "type": "loss", "content": 0.40280672907829285, "timestamp": "2025-09-05 09:11:19.887529", "step": 2895, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:11:20.083466", "step": 2895, "epoch": 3 }, { "type": "loss", "content": 0.3817463219165802, "timestamp": "2025-09-05 09:11:20.097033", "step": 2896, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:11:20.289971", "step": 2896, "epoch": 3 }, { "type": "loss", "content": 0.38125133514404297, "timestamp": "2025-09-05 09:11:20.292065", "step": 2897, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:11:20.487917", "step": 2897, "epoch": 3 }, { "type": "loss", "content": 0.29178139567375183, "timestamp": "2025-09-05 09:11:20.489961", "step": 2898, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:11:20.694429", "step": 2898, "epoch": 3 }, { "type": "loss", "content": 0.41615185141563416, "timestamp": "2025-09-05 09:11:20.696952", "step": 2899, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:11:20.894823", "step": 2899, "epoch": 3 }, { "type": "loss", "content": 0.2605353593826294, "timestamp": "2025-09-05 09:11:20.950986", "step": 2900, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 09:11:27.057789", "step": 2900, "epoch": 3 }, { "type": "pplx", "content": 55.594667031207955, "timestamp": "2025-09-05 09:11:27.061928", "step": 2900, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-05 09:11:27.195690", "step": 2900, "epoch": 3 }, { "type": "loss", "content": 0.3752339482307434, "timestamp": "2025-09-05 09:11:27.198217", "step": 2901, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:11:27.340205", "step": 2901, "epoch": 3 }, { "type": "loss", "content": 0.2262498438358307, "timestamp": "2025-09-05 09:11:27.342415", "step": 2902, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:11:27.478265", "step": 2902, "epoch": 3 }, { "type": "loss", "content": 0.2441215217113495, "timestamp": "2025-09-05 09:11:27.480989", "step": 2903, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:11:27.661026", "step": 2903, "epoch": 3 }, { "type": "loss", "content": 0.32166409492492676, "timestamp": "2025-09-05 09:11:27.670179", "step": 2904, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:11:27.804903", "step": 2904, "epoch": 3 }, { "type": "loss", "content": 0.3819389045238495, "timestamp": "2025-09-05 09:11:27.806864", "step": 2905, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-05 09:11:27.941693", "step": 2905, "epoch": 3 }, { "type": "loss", "content": 0.3978331387042999, "timestamp": "2025-09-05 09:11:27.944445", "step": 2906, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:11:28.082659", "step": 2906, "epoch": 3 }, { "type": "loss", "content": 0.16381220519542694, "timestamp": "2025-09-05 09:11:28.085360", "step": 2907, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:11:28.311273", "step": 2907, "epoch": 3 }, { "type": "loss", "content": 0.29129958152770996, "timestamp": "2025-09-05 09:11:28.327758", "step": 2908, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:11:28.495832", "step": 2908, "epoch": 3 }, { "type": "loss", "content": 0.24345554411411285, "timestamp": "2025-09-05 09:11:28.498309", "step": 2909, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:11:28.659067", "step": 2909, "epoch": 3 }, { "type": "loss", "content": 0.3856002986431122, "timestamp": "2025-09-05 09:11:28.661532", "step": 2910, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:11:29.055825", "step": 2910, "epoch": 3 }, { "type": "loss", "content": 0.32962942123413086, "timestamp": "2025-09-05 09:11:29.058096", "step": 2911, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 4480027263872.0 }, "timestamp": "2025-09-05 09:11:29.295803", "step": 2911, "epoch": 3 }, { "type": "loss", "content": 0.4046390652656555, "timestamp": "2025-09-05 09:11:29.311015", "step": 2912, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:11:29.465817", "step": 2912, "epoch": 3 }, { "type": "loss", "content": 0.26469311118125916, "timestamp": "2025-09-05 09:11:29.467655", "step": 2913, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:11:29.635648", "step": 2913, "epoch": 3 }, { "type": "loss", "content": 0.2808852791786194, "timestamp": "2025-09-05 09:11:29.637811", "step": 2914, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:11:29.810312", "step": 2914, "epoch": 3 }, { "type": "loss", "content": 0.3875598907470703, "timestamp": "2025-09-05 09:11:29.812426", "step": 2915, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:11:29.974711", "step": 2915, "epoch": 3 }, { "type": "loss", "content": 0.23085609078407288, "timestamp": "2025-09-05 09:11:29.991556", "step": 2916, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:11:30.257525", "step": 2916, "epoch": 3 }, { "type": "loss", "content": 0.24958936870098114, "timestamp": "2025-09-05 09:11:30.300724", "step": 2917, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:11:30.744519", "step": 2917, "epoch": 3 }, { "type": "loss", "content": 0.1771358847618103, "timestamp": "2025-09-05 09:11:30.774755", "step": 2918, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:11:31.008046", "step": 2918, "epoch": 3 }, { "type": "loss", "content": 0.22553907334804535, "timestamp": "2025-09-05 09:11:31.010166", "step": 2919, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:11:31.185841", "step": 2919, "epoch": 3 }, { "type": "loss", "content": 0.24437619745731354, "timestamp": "2025-09-05 09:11:31.203679", "step": 2920, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 09:11:36.728102", "step": 2920, "epoch": 3 }, { "type": "pplx", "content": 54.64921801749989, "timestamp": "2025-09-05 09:11:36.770957", "step": 2920, "epoch": 3 }, { "type": "info", "content": "Checkpoint saved at step 2920", "timestamp": "2025-09-05 09:11:37.232785", "step": 2920, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:11:37.417932", "step": 2920, "epoch": 3 }, { "type": "loss", "content": 0.21169906854629517, "timestamp": "2025-09-05 09:11:37.420206", "step": 2921, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:11:37.614738", "step": 2921, "epoch": 3 }, { "type": "loss", "content": 0.2853226363658905, "timestamp": "2025-09-05 09:11:37.621981", "step": 2922, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:11:37.875781", "step": 2922, "epoch": 3 }, { "type": "loss", "content": 0.43432602286338806, "timestamp": "2025-09-05 09:11:37.878382", "step": 2923, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:11:38.169912", "step": 2923, "epoch": 3 }, { "type": "loss", "content": 0.340687096118927, "timestamp": "2025-09-05 09:11:38.184822", "step": 2924, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:11:38.404087", "step": 2924, "epoch": 3 }, { "type": "loss", "content": 0.23280911147594452, "timestamp": "2025-09-05 09:11:38.423992", "step": 2925, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:11:38.717427", "step": 2925, "epoch": 3 }, { "type": "loss", "content": 0.3444267511367798, "timestamp": "2025-09-05 09:11:38.725155", "step": 2926, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 4800029206464.0 }, "timestamp": "2025-09-05 09:11:39.030007", "step": 2926, "epoch": 3 }, { "type": "loss", "content": 0.49055567383766174, "timestamp": "2025-09-05 09:11:39.032125", "step": 2927, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:11:39.236375", "step": 2927, "epoch": 3 }, { "type": "loss", "content": 0.15985403954982758, "timestamp": "2025-09-05 09:11:39.249699", "step": 2928, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:11:39.494175", "step": 2928, "epoch": 3 }, { "type": "loss", "content": 0.3456610441207886, "timestamp": "2025-09-05 09:11:39.537446", "step": 2929, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:11:39.746492", "step": 2929, "epoch": 3 }, { "type": "loss", "content": 0.3735094368457794, "timestamp": "2025-09-05 09:11:39.748659", "step": 2930, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:11:39.951823", "step": 2930, "epoch": 3 }, { "type": "loss", "content": 0.24350151419639587, "timestamp": "2025-09-05 09:11:39.954047", "step": 2931, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:11:40.120027", "step": 2931, "epoch": 3 }, { "type": "loss", "content": 0.16938263177871704, "timestamp": "2025-09-05 09:11:40.142112", "step": 2932, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:11:40.415486", "step": 2932, "epoch": 3 }, { "type": "loss", "content": 0.2621157169342041, "timestamp": "2025-09-05 09:11:40.417763", "step": 2933, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:11:40.629949", "step": 2933, "epoch": 3 }, { "type": "loss", "content": 0.29170507192611694, "timestamp": "2025-09-05 09:11:40.632243", "step": 2934, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:11:40.840341", "step": 2934, "epoch": 3 }, { "type": "loss", "content": 0.20361952483654022, "timestamp": "2025-09-05 09:11:40.842802", "step": 2935, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:11:41.091216", "step": 2935, "epoch": 3 }, { "type": "loss", "content": 0.2855290174484253, "timestamp": "2025-09-05 09:11:41.105181", "step": 2936, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:11:41.293735", "step": 2936, "epoch": 3 }, { "type": "loss", "content": 0.2542724609375, "timestamp": "2025-09-05 09:11:41.295789", "step": 2937, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-05 09:11:41.488491", "step": 2937, "epoch": 3 }, { "type": "loss", "content": 0.21698574721813202, "timestamp": "2025-09-05 09:11:41.490288", "step": 2938, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:11:41.684731", "step": 2938, "epoch": 3 }, { "type": "loss", "content": 0.24226777255535126, "timestamp": "2025-09-05 09:11:41.687400", "step": 2939, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:11:41.979428", "step": 2939, "epoch": 3 }, { "type": "loss", "content": 0.24583564698696136, "timestamp": "2025-09-05 09:11:41.992509", "step": 2940, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 09:11:47.784860", "step": 2940, "epoch": 3 }, { "type": "pplx", "content": 54.78391043335974, "timestamp": "2025-09-05 09:11:47.787728", "step": 2940, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:11:48.031744", "step": 2940, "epoch": 3 }, { "type": "loss", "content": 0.2940543293952942, "timestamp": "2025-09-05 09:11:48.050706", "step": 2941, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:11:48.259904", "step": 2941, "epoch": 3 }, { "type": "loss", "content": 0.39327362179756165, "timestamp": "2025-09-05 09:11:48.262429", "step": 2942, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:11:48.460178", "step": 2942, "epoch": 3 }, { "type": "loss", "content": 0.38169828057289124, "timestamp": "2025-09-05 09:11:48.462726", "step": 2943, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:11:48.669720", "step": 2943, "epoch": 3 }, { "type": "loss", "content": 0.3624424338340759, "timestamp": "2025-09-05 09:11:48.724431", "step": 2944, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:11:48.968080", "step": 2944, "epoch": 3 }, { "type": "loss", "content": 0.1788206547498703, "timestamp": "2025-09-05 09:11:49.012455", "step": 2945, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:11:49.352156", "step": 2945, "epoch": 3 }, { "type": "loss", "content": 0.32311442494392395, "timestamp": "2025-09-05 09:11:49.354956", "step": 2946, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:11:49.559728", "step": 2946, "epoch": 3 }, { "type": "loss", "content": 0.15347428619861603, "timestamp": "2025-09-05 09:11:49.562279", "step": 2947, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:11:49.767876", "step": 2947, "epoch": 3 }, { "type": "loss", "content": 0.3563332259654999, "timestamp": "2025-09-05 09:11:49.825040", "step": 2948, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:11:50.052862", "step": 2948, "epoch": 3 }, { "type": "loss", "content": 0.13164092600345612, "timestamp": "2025-09-05 09:11:50.055820", "step": 2949, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:11:50.263077", "step": 2949, "epoch": 3 }, { "type": "loss", "content": 0.41619938611984253, "timestamp": "2025-09-05 09:11:50.265649", "step": 2950, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:11:50.462108", "step": 2950, "epoch": 3 }, { "type": "loss", "content": 0.31873831152915955, "timestamp": "2025-09-05 09:11:50.500202", "step": 2951, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:11:50.748488", "step": 2951, "epoch": 3 }, { "type": "loss", "content": 0.38107752799987793, "timestamp": "2025-09-05 09:11:50.762883", "step": 2952, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:11:50.951238", "step": 2952, "epoch": 3 }, { "type": "loss", "content": 0.2561323344707489, "timestamp": "2025-09-05 09:11:50.953267", "step": 2953, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:11:51.157835", "step": 2953, "epoch": 3 }, { "type": "loss", "content": 0.42956051230430603, "timestamp": "2025-09-05 09:11:51.200100", "step": 2954, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:11:51.449318", "step": 2954, "epoch": 3 }, { "type": "loss", "content": 0.328776091337204, "timestamp": "2025-09-05 09:11:51.451728", "step": 2955, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:11:51.656524", "step": 2955, "epoch": 3 }, { "type": "loss", "content": 0.3320053815841675, "timestamp": "2025-09-05 09:11:51.670659", "step": 2956, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:11:51.863783", "step": 2956, "epoch": 3 }, { "type": "loss", "content": 0.42864733934402466, "timestamp": "2025-09-05 09:11:51.866444", "step": 2957, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:11:52.114254", "step": 2957, "epoch": 3 }, { "type": "loss", "content": 0.35642778873443604, "timestamp": "2025-09-05 09:11:52.158212", "step": 2958, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:11:52.458736", "step": 2958, "epoch": 3 }, { "type": "loss", "content": 0.46565619111061096, "timestamp": "2025-09-05 09:11:52.461733", "step": 2959, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:11:52.667328", "step": 2959, "epoch": 3 }, { "type": "loss", "content": 0.265375018119812, "timestamp": "2025-09-05 09:11:52.682797", "step": 2960, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 09:11:58.548505", "step": 2960, "epoch": 3 }, { "type": "pplx", "content": 55.55334624048589, "timestamp": "2025-09-05 09:11:58.550509", "step": 2960, "epoch": 3 }, { "type": "info", "content": "Checkpoint saved at step 2960", "timestamp": "2025-09-05 09:11:58.989021", "step": 2960, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:11:59.158088", "step": 2960, "epoch": 3 }, { "type": "loss", "content": 0.38203999400138855, "timestamp": "2025-09-05 09:11:59.160162", "step": 2961, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:11:59.356216", "step": 2961, "epoch": 3 }, { "type": "loss", "content": 0.2547346353530884, "timestamp": "2025-09-05 09:11:59.357933", "step": 2962, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:11:59.552815", "step": 2962, "epoch": 3 }, { "type": "loss", "content": 0.27306175231933594, "timestamp": "2025-09-05 09:11:59.554906", "step": 2963, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:11:59.751367", "step": 2963, "epoch": 3 }, { "type": "loss", "content": 0.3331190347671509, "timestamp": "2025-09-05 09:11:59.766971", "step": 2964, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:11:59.963517", "step": 2964, "epoch": 3 }, { "type": "loss", "content": 0.47358471155166626, "timestamp": "2025-09-05 09:11:59.965670", "step": 2965, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:12:00.174056", "step": 2965, "epoch": 3 }, { "type": "loss", "content": 0.2792886197566986, "timestamp": "2025-09-05 09:12:00.175714", "step": 2966, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:12:00.370898", "step": 2966, "epoch": 3 }, { "type": "loss", "content": 0.32101067900657654, "timestamp": "2025-09-05 09:12:00.372845", "step": 2967, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:12:00.538749", "step": 2967, "epoch": 3 }, { "type": "loss", "content": 0.27621304988861084, "timestamp": "2025-09-05 09:12:00.552344", "step": 2968, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:12:00.741327", "step": 2968, "epoch": 3 }, { "type": "loss", "content": 0.415333092212677, "timestamp": "2025-09-05 09:12:00.743833", "step": 2969, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:12:00.950697", "step": 2969, "epoch": 3 }, { "type": "loss", "content": 0.26739320158958435, "timestamp": "2025-09-05 09:12:00.952648", "step": 2970, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 4480027263872.0 }, "timestamp": "2025-09-05 09:12:01.159894", "step": 2970, "epoch": 3 }, { "type": "loss", "content": 0.24618351459503174, "timestamp": "2025-09-05 09:12:01.162045", "step": 2971, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:12:01.324984", "step": 2971, "epoch": 3 }, { "type": "loss", "content": 0.20293676853179932, "timestamp": "2025-09-05 09:12:01.342168", "step": 2972, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:12:01.538887", "step": 2972, "epoch": 3 }, { "type": "loss", "content": 0.36944884061813354, "timestamp": "2025-09-05 09:12:01.540964", "step": 2973, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:12:01.738840", "step": 2973, "epoch": 3 }, { "type": "loss", "content": 0.28939300775527954, "timestamp": "2025-09-05 09:12:01.743461", "step": 2974, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:12:01.953118", "step": 2974, "epoch": 3 }, { "type": "loss", "content": 0.37455692887306213, "timestamp": "2025-09-05 09:12:01.955446", "step": 2975, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:12:02.165822", "step": 2975, "epoch": 3 }, { "type": "loss", "content": 0.3490968644618988, "timestamp": "2025-09-05 09:12:02.179427", "step": 2976, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:12:02.375029", "step": 2976, "epoch": 3 }, { "type": "loss", "content": 0.375367134809494, "timestamp": "2025-09-05 09:12:02.378208", "step": 2977, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:12:02.574703", "step": 2977, "epoch": 3 }, { "type": "loss", "content": 0.2835385799407959, "timestamp": "2025-09-05 09:12:02.576904", "step": 2978, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:12:02.771920", "step": 2978, "epoch": 3 }, { "type": "loss", "content": 0.39227986335754395, "timestamp": "2025-09-05 09:12:02.774121", "step": 2979, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:12:02.968704", "step": 2979, "epoch": 3 }, { "type": "loss", "content": 0.29060855507850647, "timestamp": "2025-09-05 09:12:02.985594", "step": 2980, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 09:12:07.636923", "step": 2980, "epoch": 3 }, { "type": "pplx", "content": 54.606959405023254, "timestamp": "2025-09-05 09:12:07.638898", "step": 2980, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:12:07.798582", "step": 2980, "epoch": 3 }, { "type": "loss", "content": 0.20793351531028748, "timestamp": "2025-09-05 09:12:07.800534", "step": 2981, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:12:07.965219", "step": 2981, "epoch": 3 }, { "type": "loss", "content": 0.2619383931159973, "timestamp": "2025-09-05 09:12:07.967179", "step": 2982, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:12:08.171327", "step": 2982, "epoch": 3 }, { "type": "loss", "content": 0.27213922142982483, "timestamp": "2025-09-05 09:12:08.173305", "step": 2983, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:12:08.376566", "step": 2983, "epoch": 3 }, { "type": "loss", "content": 0.2601911425590515, "timestamp": "2025-09-05 09:12:08.385366", "step": 2984, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:12:08.547557", "step": 2984, "epoch": 3 }, { "type": "loss", "content": 0.2833835184574127, "timestamp": "2025-09-05 09:12:08.549483", "step": 2985, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:12:08.754222", "step": 2985, "epoch": 3 }, { "type": "loss", "content": 0.40964579582214355, "timestamp": "2025-09-05 09:12:08.756656", "step": 2986, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:12:08.961846", "step": 2986, "epoch": 3 }, { "type": "loss", "content": 0.41075843572616577, "timestamp": "2025-09-05 09:12:08.964043", "step": 2987, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:12:09.168674", "step": 2987, "epoch": 3 }, { "type": "loss", "content": 0.43984243273735046, "timestamp": "2025-09-05 09:12:09.182401", "step": 2988, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:12:09.368376", "step": 2988, "epoch": 3 }, { "type": "loss", "content": 0.40275534987449646, "timestamp": "2025-09-05 09:12:09.370394", "step": 2989, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:12:09.574501", "step": 2989, "epoch": 3 }, { "type": "loss", "content": 0.32596346735954285, "timestamp": "2025-09-05 09:12:09.576459", "step": 2990, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:12:09.741903", "step": 2990, "epoch": 3 }, { "type": "loss", "content": 0.3702400326728821, "timestamp": "2025-09-05 09:12:09.744360", "step": 2991, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:12:09.934911", "step": 2991, "epoch": 3 }, { "type": "loss", "content": 0.33719366788864136, "timestamp": "2025-09-05 09:12:09.943923", "step": 2992, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:12:10.102127", "step": 2992, "epoch": 3 }, { "type": "loss", "content": 0.2717023491859436, "timestamp": "2025-09-05 09:12:10.104021", "step": 2993, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:12:10.308830", "step": 2993, "epoch": 3 }, { "type": "loss", "content": 0.40143582224845886, "timestamp": "2025-09-05 09:12:10.311152", "step": 2994, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-05 09:12:10.503737", "step": 2994, "epoch": 3 }, { "type": "loss", "content": 0.2984316349029541, "timestamp": "2025-09-05 09:12:10.510745", "step": 2995, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:12:10.706382", "step": 2995, "epoch": 3 }, { "type": "loss", "content": 0.14753331243991852, "timestamp": "2025-09-05 09:12:10.715441", "step": 2996, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:12:10.876908", "step": 2996, "epoch": 3 }, { "type": "loss", "content": 0.2177143543958664, "timestamp": "2025-09-05 09:12:10.879118", "step": 2997, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:12:11.082860", "step": 2997, "epoch": 3 }, { "type": "loss", "content": 0.39515912532806396, "timestamp": "2025-09-05 09:12:11.085024", "step": 2998, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:12:11.252542", "step": 2998, "epoch": 3 }, { "type": "loss", "content": 0.3414614498615265, "timestamp": "2025-09-05 09:12:11.254655", "step": 2999, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:12:11.448211", "step": 2999, "epoch": 3 }, { "type": "loss", "content": 0.34329718351364136, "timestamp": "2025-09-05 09:12:11.462044", "step": 3000, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 09:12:16.107356", "step": 3000, "epoch": 3 }, { "type": "pplx", "content": 53.10410466038564, "timestamp": "2025-09-05 09:12:16.109410", "step": 3000, "epoch": 3 }, { "type": "info", "content": "Checkpoint saved at step 3000", "timestamp": "2025-09-05 09:12:16.567964", "step": 3000, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:12:16.746854", "step": 3000, "epoch": 3 }, { "type": "loss", "content": 0.5219593048095703, "timestamp": "2025-09-05 09:12:16.748733", "step": 3001, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:12:16.914361", "step": 3001, "epoch": 3 }, { "type": "loss", "content": 0.22384533286094666, "timestamp": "2025-09-05 09:12:16.916455", "step": 3002, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:12:17.118982", "step": 3002, "epoch": 3 }, { "type": "loss", "content": 0.35147175192832947, "timestamp": "2025-09-05 09:12:17.121318", "step": 3003, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:12:17.324431", "step": 3003, "epoch": 3 }, { "type": "loss", "content": 0.3589687943458557, "timestamp": "2025-09-05 09:12:17.340452", "step": 3004, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:12:17.535790", "step": 3004, "epoch": 3 }, { "type": "loss", "content": 0.27491340041160583, "timestamp": "2025-09-05 09:12:17.538762", "step": 3005, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 4480027263872.0 }, "timestamp": "2025-09-05 09:12:17.732860", "step": 3005, "epoch": 3 }, { "type": "loss", "content": 0.2931725084781647, "timestamp": "2025-09-05 09:12:17.735179", "step": 3006, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:12:17.937747", "step": 3006, "epoch": 3 }, { "type": "loss", "content": 0.299150675535202, "timestamp": "2025-09-05 09:12:17.940089", "step": 3007, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:12:18.105046", "step": 3007, "epoch": 3 }, { "type": "loss", "content": 0.30979087948799133, "timestamp": "2025-09-05 09:12:18.120936", "step": 3008, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:12:18.317017", "step": 3008, "epoch": 3 }, { "type": "loss", "content": 0.26512518525123596, "timestamp": "2025-09-05 09:12:18.319197", "step": 3009, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:12:18.513289", "step": 3009, "epoch": 3 }, { "type": "loss", "content": 0.1896071434020996, "timestamp": "2025-09-05 09:12:18.515493", "step": 3010, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:12:18.711265", "step": 3010, "epoch": 3 }, { "type": "loss", "content": 0.3693452477455139, "timestamp": "2025-09-05 09:12:18.713314", "step": 3011, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:12:18.915519", "step": 3011, "epoch": 3 }, { "type": "loss", "content": 0.2476043701171875, "timestamp": "2025-09-05 09:12:18.929455", "step": 3012, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:12:19.117546", "step": 3012, "epoch": 3 }, { "type": "loss", "content": 0.3103981912136078, "timestamp": "2025-09-05 09:12:19.119840", "step": 3013, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:12:19.324252", "step": 3013, "epoch": 3 }, { "type": "loss", "content": 0.38308852910995483, "timestamp": "2025-09-05 09:12:19.326376", "step": 3014, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:12:19.530283", "step": 3014, "epoch": 3 }, { "type": "loss", "content": 0.27662599086761475, "timestamp": "2025-09-05 09:12:19.532388", "step": 3015, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:12:19.725363", "step": 3015, "epoch": 3 }, { "type": "loss", "content": 0.2989875376224518, "timestamp": "2025-09-05 09:12:19.739697", "step": 3016, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-05 09:12:19.924782", "step": 3016, "epoch": 3 }, { "type": "loss", "content": 0.2708384394645691, "timestamp": "2025-09-05 09:12:19.927781", "step": 3017, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:12:20.122071", "step": 3017, "epoch": 3 }, { "type": "loss", "content": 0.2433338612318039, "timestamp": "2025-09-05 09:12:20.124302", "step": 3018, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:12:20.288372", "step": 3018, "epoch": 3 }, { "type": "loss", "content": 0.304372102022171, "timestamp": "2025-09-05 09:12:20.291067", "step": 3019, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:12:20.495596", "step": 3019, "epoch": 3 }, { "type": "loss", "content": 0.27465543150901794, "timestamp": "2025-09-05 09:12:20.509765", "step": 3020, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 09:12:25.167359", "step": 3020, "epoch": 3 }, { "type": "pplx", "content": 52.50252237851087, "timestamp": "2025-09-05 09:12:25.169469", "step": 3020, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:12:25.330830", "step": 3020, "epoch": 3 }, { "type": "loss", "content": 0.2500014305114746, "timestamp": "2025-09-05 09:12:25.332814", "step": 3021, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:12:25.496063", "step": 3021, "epoch": 3 }, { "type": "loss", "content": 0.2104569673538208, "timestamp": "2025-09-05 09:12:25.498366", "step": 3022, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:12:25.701436", "step": 3022, "epoch": 3 }, { "type": "loss", "content": 0.1959143728017807, "timestamp": "2025-09-05 09:12:25.703511", "step": 3023, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:12:25.909136", "step": 3023, "epoch": 3 }, { "type": "loss", "content": 0.3365468978881836, "timestamp": "2025-09-05 09:12:25.923098", "step": 3024, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:12:26.118433", "step": 3024, "epoch": 3 }, { "type": "loss", "content": 0.29884475469589233, "timestamp": "2025-09-05 09:12:26.120908", "step": 3025, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:12:26.316247", "step": 3025, "epoch": 3 }, { "type": "loss", "content": 0.2602010667324066, "timestamp": "2025-09-05 09:12:26.318653", "step": 3026, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:12:26.524920", "step": 3026, "epoch": 3 }, { "type": "loss", "content": 0.28113895654678345, "timestamp": "2025-09-05 09:12:26.527074", "step": 3027, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 4480027263872.0 }, "timestamp": "2025-09-05 09:12:26.722996", "step": 3027, "epoch": 3 }, { "type": "loss", "content": 0.34172090888023376, "timestamp": "2025-09-05 09:12:26.741293", "step": 3028, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:12:26.938961", "step": 3028, "epoch": 3 }, { "type": "loss", "content": 0.401425838470459, "timestamp": "2025-09-05 09:12:26.940826", "step": 3029, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:12:27.146216", "step": 3029, "epoch": 3 }, { "type": "loss", "content": 0.2900570034980774, "timestamp": "2025-09-05 09:12:27.148185", "step": 3030, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:12:27.343391", "step": 3030, "epoch": 3 }, { "type": "loss", "content": 0.2659049928188324, "timestamp": "2025-09-05 09:12:27.345656", "step": 3031, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:12:27.550953", "step": 3031, "epoch": 3 }, { "type": "loss", "content": 0.30906596779823303, "timestamp": "2025-09-05 09:12:27.567599", "step": 3032, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:12:27.761515", "step": 3032, "epoch": 3 }, { "type": "loss", "content": 0.4450221657752991, "timestamp": "2025-09-05 09:12:27.763636", "step": 3033, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:12:27.959754", "step": 3033, "epoch": 3 }, { "type": "loss", "content": 0.35517576336860657, "timestamp": "2025-09-05 09:12:27.962312", "step": 3034, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:12:28.167097", "step": 3034, "epoch": 3 }, { "type": "loss", "content": 0.4547875225543976, "timestamp": "2025-09-05 09:12:28.169352", "step": 3035, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:12:28.364541", "step": 3035, "epoch": 3 }, { "type": "loss", "content": 0.2977902591228485, "timestamp": "2025-09-05 09:12:28.381078", "step": 3036, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:12:28.574247", "step": 3036, "epoch": 3 }, { "type": "loss", "content": 0.2584066092967987, "timestamp": "2025-09-05 09:12:28.576226", "step": 3037, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:12:28.778959", "step": 3037, "epoch": 3 }, { "type": "loss", "content": 0.22676676511764526, "timestamp": "2025-09-05 09:12:28.781181", "step": 3038, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:12:28.974249", "step": 3038, "epoch": 3 }, { "type": "loss", "content": 0.20184023678302765, "timestamp": "2025-09-05 09:12:28.976433", "step": 3039, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:12:29.179150", "step": 3039, "epoch": 3 }, { "type": "loss", "content": 0.3374442458152771, "timestamp": "2025-09-05 09:12:29.195816", "step": 3040, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 09:12:33.868272", "step": 3040, "epoch": 3 }, { "type": "pplx", "content": 52.40073254912315, "timestamp": "2025-09-05 09:12:33.870120", "step": 3040, "epoch": 3 }, { "type": "info", "content": "Checkpoint saved at step 3040", "timestamp": "2025-09-05 09:12:34.326846", "step": 3040, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:12:34.467443", "step": 3040, "epoch": 3 }, { "type": "loss", "content": 0.4248269498348236, "timestamp": "2025-09-05 09:12:34.469714", "step": 3041, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:12:34.636452", "step": 3041, "epoch": 3 }, { "type": "loss", "content": 0.25288718938827515, "timestamp": "2025-09-05 09:12:34.638663", "step": 3042, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:12:34.809846", "step": 3042, "epoch": 3 }, { "type": "loss", "content": 0.31233614683151245, "timestamp": "2025-09-05 09:12:34.811951", "step": 3043, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:12:34.970532", "step": 3043, "epoch": 3 }, { "type": "loss", "content": 0.3236229419708252, "timestamp": "2025-09-05 09:12:34.984081", "step": 3044, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:12:35.136091", "step": 3044, "epoch": 3 }, { "type": "loss", "content": 0.19996929168701172, "timestamp": "2025-09-05 09:12:35.138576", "step": 3045, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:12:35.296364", "step": 3045, "epoch": 3 }, { "type": "loss", "content": 0.2629333436489105, "timestamp": "2025-09-05 09:12:35.298689", "step": 3046, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-05 09:12:35.468744", "step": 3046, "epoch": 3 }, { "type": "loss", "content": 0.20324602723121643, "timestamp": "2025-09-05 09:12:35.470496", "step": 3047, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:12:35.628681", "step": 3047, "epoch": 3 }, { "type": "loss", "content": 0.3019815981388092, "timestamp": "2025-09-05 09:12:35.642804", "step": 3048, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:12:35.797526", "step": 3048, "epoch": 3 }, { "type": "loss", "content": 0.2801806628704071, "timestamp": "2025-09-05 09:12:35.800581", "step": 3049, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-05 09:12:35.970991", "step": 3049, "epoch": 3 }, { "type": "loss", "content": 0.27385973930358887, "timestamp": "2025-09-05 09:12:35.973122", "step": 3050, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:12:36.136878", "step": 3050, "epoch": 3 }, { "type": "loss", "content": 0.30080705881118774, "timestamp": "2025-09-05 09:12:36.139304", "step": 3051, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:12:36.307841", "step": 3051, "epoch": 3 }, { "type": "loss", "content": 0.3155876398086548, "timestamp": "2025-09-05 09:12:36.324192", "step": 3052, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:12:36.485707", "step": 3052, "epoch": 3 }, { "type": "loss", "content": 0.3727303445339203, "timestamp": "2025-09-05 09:12:36.488132", "step": 3053, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:12:36.646680", "step": 3053, "epoch": 3 }, { "type": "loss", "content": 0.21063818037509918, "timestamp": "2025-09-05 09:12:36.648771", "step": 3054, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:12:36.806161", "step": 3054, "epoch": 3 }, { "type": "loss", "content": 0.37034881114959717, "timestamp": "2025-09-05 09:12:36.808221", "step": 3055, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:12:36.967297", "step": 3055, "epoch": 3 }, { "type": "loss", "content": 0.3373739719390869, "timestamp": "2025-09-05 09:12:36.980504", "step": 3056, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:12:37.134015", "step": 3056, "epoch": 3 }, { "type": "loss", "content": 0.2569211721420288, "timestamp": "2025-09-05 09:12:37.136192", "step": 3057, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-05 09:12:37.293082", "step": 3057, "epoch": 3 }, { "type": "loss", "content": 0.22303147614002228, "timestamp": "2025-09-05 09:12:37.296720", "step": 3058, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:12:37.472309", "step": 3058, "epoch": 3 }, { "type": "loss", "content": 0.2923431396484375, "timestamp": "2025-09-05 09:12:37.474200", "step": 3059, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 4480027263872.0 }, "timestamp": "2025-09-05 09:12:37.634249", "step": 3059, "epoch": 3 }, { "type": "loss", "content": 0.41064414381980896, "timestamp": "2025-09-05 09:12:37.648170", "step": 3060, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 09:12:42.310853", "step": 3060, "epoch": 3 }, { "type": "pplx", "content": 53.04668741895991, "timestamp": "2025-09-05 09:12:42.313061", "step": 3060, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:12:42.445731", "step": 3060, "epoch": 3 }, { "type": "loss", "content": 0.2520500719547272, "timestamp": "2025-09-05 09:12:42.448010", "step": 3061, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:12:42.584143", "step": 3061, "epoch": 3 }, { "type": "loss", "content": 0.33572205901145935, "timestamp": "2025-09-05 09:12:42.588096", "step": 3062, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:12:42.762657", "step": 3062, "epoch": 3 }, { "type": "loss", "content": 0.2813510298728943, "timestamp": "2025-09-05 09:12:42.764778", "step": 3063, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:12:42.925306", "step": 3063, "epoch": 3 }, { "type": "loss", "content": 0.23829667270183563, "timestamp": "2025-09-05 09:12:42.941931", "step": 3064, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:12:43.104263", "step": 3064, "epoch": 3 }, { "type": "loss", "content": 0.2956577241420746, "timestamp": "2025-09-05 09:12:43.106274", "step": 3065, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:12:43.266859", "step": 3065, "epoch": 3 }, { "type": "loss", "content": 0.24571780860424042, "timestamp": "2025-09-05 09:12:43.269115", "step": 3066, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:12:43.426658", "step": 3066, "epoch": 3 }, { "type": "loss", "content": 0.17806021869182587, "timestamp": "2025-09-05 09:12:43.429191", "step": 3067, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:12:43.600917", "step": 3067, "epoch": 3 }, { "type": "loss", "content": 0.2593998908996582, "timestamp": "2025-09-05 09:12:43.609713", "step": 3068, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:12:43.742291", "step": 3068, "epoch": 3 }, { "type": "loss", "content": 0.34801289439201355, "timestamp": "2025-09-05 09:12:43.744763", "step": 3069, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:12:43.912633", "step": 3069, "epoch": 3 }, { "type": "loss", "content": 0.5196738839149475, "timestamp": "2025-09-05 09:12:43.914724", "step": 3070, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:12:44.073133", "step": 3070, "epoch": 3 }, { "type": "loss", "content": 0.3646704852581024, "timestamp": "2025-09-05 09:12:44.075393", "step": 3071, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:12:44.248356", "step": 3071, "epoch": 3 }, { "type": "loss", "content": 0.3963429033756256, "timestamp": "2025-09-05 09:12:44.265014", "step": 3072, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:12:44.429735", "step": 3072, "epoch": 3 }, { "type": "loss", "content": 0.265206903219223, "timestamp": "2025-09-05 09:12:44.432399", "step": 3073, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:12:44.596582", "step": 3073, "epoch": 3 }, { "type": "loss", "content": 0.26692867279052734, "timestamp": "2025-09-05 09:12:44.599011", "step": 3074, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:12:44.772730", "step": 3074, "epoch": 3 }, { "type": "loss", "content": 0.45934414863586426, "timestamp": "2025-09-05 09:12:44.774557", "step": 3075, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:12:44.949906", "step": 3075, "epoch": 3 }, { "type": "loss", "content": 0.34467387199401855, "timestamp": "2025-09-05 09:12:44.966102", "step": 3076, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:12:45.159510", "step": 3076, "epoch": 3 }, { "type": "loss", "content": 0.2720514237880707, "timestamp": "2025-09-05 09:12:45.161795", "step": 3077, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:12:45.327367", "step": 3077, "epoch": 3 }, { "type": "loss", "content": 0.23640888929367065, "timestamp": "2025-09-05 09:12:45.329368", "step": 3078, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:12:45.492634", "step": 3078, "epoch": 3 }, { "type": "loss", "content": 0.13110221922397614, "timestamp": "2025-09-05 09:12:45.494825", "step": 3079, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:12:45.659733", "step": 3079, "epoch": 3 }, { "type": "loss", "content": 0.3650048077106476, "timestamp": "2025-09-05 09:12:45.675789", "step": 3080, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 09:12:50.314454", "step": 3080, "epoch": 3 }, { "type": "pplx", "content": 53.94136359098753, "timestamp": "2025-09-05 09:12:50.316945", "step": 3080, "epoch": 3 }, { "type": "info", "content": "Checkpoint saved at step 3080", "timestamp": "2025-09-05 09:12:50.796727", "step": 3080, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:12:50.935383", "step": 3080, "epoch": 3 }, { "type": "loss", "content": 0.2984461188316345, "timestamp": "2025-09-05 09:12:50.937442", "step": 3081, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:12:51.107896", "step": 3081, "epoch": 3 }, { "type": "loss", "content": 0.3399762213230133, "timestamp": "2025-09-05 09:12:51.109895", "step": 3082, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:12:51.274249", "step": 3082, "epoch": 3 }, { "type": "loss", "content": 0.19832485914230347, "timestamp": "2025-09-05 09:12:51.276230", "step": 3083, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:12:51.440549", "step": 3083, "epoch": 3 }, { "type": "loss", "content": 0.3151915669441223, "timestamp": "2025-09-05 09:12:51.454599", "step": 3084, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:12:51.617929", "step": 3084, "epoch": 3 }, { "type": "loss", "content": 0.27868831157684326, "timestamp": "2025-09-05 09:12:51.620026", "step": 3085, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:12:51.786145", "step": 3085, "epoch": 3 }, { "type": "loss", "content": 0.3083263635635376, "timestamp": "2025-09-05 09:12:51.788444", "step": 3086, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:12:51.962175", "step": 3086, "epoch": 3 }, { "type": "loss", "content": 0.23392392694950104, "timestamp": "2025-09-05 09:12:51.964192", "step": 3087, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:12:52.128880", "step": 3087, "epoch": 3 }, { "type": "loss", "content": 0.36243969202041626, "timestamp": "2025-09-05 09:12:52.142581", "step": 3088, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:12:52.301060", "step": 3088, "epoch": 3 }, { "type": "loss", "content": 0.36660972237586975, "timestamp": "2025-09-05 09:12:52.303374", "step": 3089, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:12:52.467650", "step": 3089, "epoch": 3 }, { "type": "loss", "content": 0.1570238471031189, "timestamp": "2025-09-05 09:12:52.469835", "step": 3090, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:12:52.643306", "step": 3090, "epoch": 3 }, { "type": "loss", "content": 0.22328078746795654, "timestamp": "2025-09-05 09:12:52.645564", "step": 3091, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:12:52.810208", "step": 3091, "epoch": 3 }, { "type": "loss", "content": 0.34495118260383606, "timestamp": "2025-09-05 09:12:52.827558", "step": 3092, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:12:52.985898", "step": 3092, "epoch": 3 }, { "type": "loss", "content": 0.3345975875854492, "timestamp": "2025-09-05 09:12:52.988073", "step": 3093, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:12:53.151108", "step": 3093, "epoch": 3 }, { "type": "loss", "content": 0.31810086965560913, "timestamp": "2025-09-05 09:12:53.154398", "step": 3094, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:12:53.325100", "step": 3094, "epoch": 3 }, { "type": "loss", "content": 0.3396746814250946, "timestamp": "2025-09-05 09:12:53.327879", "step": 3095, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:12:53.492335", "step": 3095, "epoch": 3 }, { "type": "loss", "content": 0.319447785615921, "timestamp": "2025-09-05 09:12:53.508844", "step": 3096, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:12:53.675150", "step": 3096, "epoch": 3 }, { "type": "loss", "content": 0.422185480594635, "timestamp": "2025-09-05 09:12:53.677142", "step": 3097, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-05 09:12:53.845714", "step": 3097, "epoch": 3 }, { "type": "loss", "content": 0.24050793051719666, "timestamp": "2025-09-05 09:12:53.848556", "step": 3098, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:12:54.014849", "step": 3098, "epoch": 3 }, { "type": "loss", "content": 0.3138499855995178, "timestamp": "2025-09-05 09:12:54.017092", "step": 3099, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:12:54.180414", "step": 3099, "epoch": 3 }, { "type": "loss", "content": 0.20346899330615997, "timestamp": "2025-09-05 09:12:54.194225", "step": 3100, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 09:12:58.862549", "step": 3100, "epoch": 3 }, { "type": "pplx", "content": 54.427679569639615, "timestamp": "2025-09-05 09:12:58.865548", "step": 3100, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:12:59.033815", "step": 3100, "epoch": 3 }, { "type": "loss", "content": 0.30215319991111755, "timestamp": "2025-09-05 09:12:59.035886", "step": 3101, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:12:59.201630", "step": 3101, "epoch": 3 }, { "type": "loss", "content": 0.26254820823669434, "timestamp": "2025-09-05 09:12:59.203802", "step": 3102, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:12:59.409539", "step": 3102, "epoch": 3 }, { "type": "loss", "content": 0.32716652750968933, "timestamp": "2025-09-05 09:12:59.412056", "step": 3103, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:12:59.604460", "step": 3103, "epoch": 3 }, { "type": "loss", "content": 0.3003425896167755, "timestamp": "2025-09-05 09:12:59.618822", "step": 3104, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:12:59.806993", "step": 3104, "epoch": 3 }, { "type": "loss", "content": 0.2939733564853668, "timestamp": "2025-09-05 09:12:59.809008", "step": 3105, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:13:00.002760", "step": 3105, "epoch": 3 }, { "type": "loss", "content": 0.25660908222198486, "timestamp": "2025-09-05 09:13:00.004380", "step": 3106, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:13:00.208171", "step": 3106, "epoch": 3 }, { "type": "loss", "content": 0.23837247490882874, "timestamp": "2025-09-05 09:13:00.209793", "step": 3107, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:13:00.372238", "step": 3107, "epoch": 3 }, { "type": "loss", "content": 0.34475165605545044, "timestamp": "2025-09-05 09:13:00.388481", "step": 3108, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:13:00.580255", "step": 3108, "epoch": 3 }, { "type": "loss", "content": 0.22767935693264008, "timestamp": "2025-09-05 09:13:00.581944", "step": 3109, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:13:00.782539", "step": 3109, "epoch": 3 }, { "type": "loss", "content": 0.28474533557891846, "timestamp": "2025-09-05 09:13:00.784441", "step": 3110, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:13:00.949594", "step": 3110, "epoch": 3 }, { "type": "loss", "content": 0.19329321384429932, "timestamp": "2025-09-05 09:13:00.951484", "step": 3111, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:13:01.145321", "step": 3111, "epoch": 3 }, { "type": "loss", "content": 0.35781329870224, "timestamp": "2025-09-05 09:13:01.158501", "step": 3112, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:13:01.345180", "step": 3112, "epoch": 3 }, { "type": "loss", "content": 0.3530445694923401, "timestamp": "2025-09-05 09:13:01.346934", "step": 3113, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:13:01.540655", "step": 3113, "epoch": 3 }, { "type": "loss", "content": 0.40410447120666504, "timestamp": "2025-09-05 09:13:01.542300", "step": 3114, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:13:01.736198", "step": 3114, "epoch": 3 }, { "type": "loss", "content": 0.4058263301849365, "timestamp": "2025-09-05 09:13:01.737932", "step": 3115, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:13:01.930582", "step": 3115, "epoch": 3 }, { "type": "loss", "content": 0.27423328161239624, "timestamp": "2025-09-05 09:13:01.944971", "step": 3116, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:13:02.141696", "step": 3116, "epoch": 3 }, { "type": "loss", "content": 0.2607077956199646, "timestamp": "2025-09-05 09:13:02.143699", "step": 3117, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:13:02.340557", "step": 3117, "epoch": 3 }, { "type": "loss", "content": 0.42114734649658203, "timestamp": "2025-09-05 09:13:02.342579", "step": 3118, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:13:02.546126", "step": 3118, "epoch": 3 }, { "type": "loss", "content": 0.3532010316848755, "timestamp": "2025-09-05 09:13:02.548110", "step": 3119, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:13:02.742858", "step": 3119, "epoch": 3 }, { "type": "loss", "content": 0.23121574521064758, "timestamp": "2025-09-05 09:13:02.758242", "step": 3120, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 09:13:07.392319", "step": 3120, "epoch": 3 }, { "type": "pplx", "content": 55.011493179080475, "timestamp": "2025-09-05 09:13:07.394549", "step": 3120, "epoch": 3 }, { "type": "info", "content": "Checkpoint saved at step 3120", "timestamp": "2025-09-05 09:13:07.852680", "step": 3120, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:13:08.021571", "step": 3120, "epoch": 3 }, { "type": "loss", "content": 0.3971679210662842, "timestamp": "2025-09-05 09:13:08.023605", "step": 3121, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:13:08.218220", "step": 3121, "epoch": 3 }, { "type": "loss", "content": 0.3160933256149292, "timestamp": "2025-09-05 09:13:08.220278", "step": 3122, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:13:08.423966", "step": 3122, "epoch": 3 }, { "type": "loss", "content": 0.31142422556877136, "timestamp": "2025-09-05 09:13:08.426571", "step": 3123, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:13:08.621738", "step": 3123, "epoch": 3 }, { "type": "loss", "content": 0.32644516229629517, "timestamp": "2025-09-05 09:13:08.637658", "step": 3124, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:13:08.833274", "step": 3124, "epoch": 3 }, { "type": "loss", "content": 0.30763083696365356, "timestamp": "2025-09-05 09:13:08.835053", "step": 3125, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:13:09.040750", "step": 3125, "epoch": 3 }, { "type": "loss", "content": 0.26925909519195557, "timestamp": "2025-09-05 09:13:09.042723", "step": 3126, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:13:09.237489", "step": 3126, "epoch": 3 }, { "type": "loss", "content": 0.3116042912006378, "timestamp": "2025-09-05 09:13:09.239615", "step": 3127, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:13:09.404887", "step": 3127, "epoch": 3 }, { "type": "loss", "content": 0.21474801003932953, "timestamp": "2025-09-05 09:13:09.421229", "step": 3128, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:13:09.619863", "step": 3128, "epoch": 3 }, { "type": "loss", "content": 0.26150596141815186, "timestamp": "2025-09-05 09:13:09.622785", "step": 3129, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:13:09.820954", "step": 3129, "epoch": 3 }, { "type": "loss", "content": 0.1961638182401657, "timestamp": "2025-09-05 09:13:09.823856", "step": 3130, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:13:10.020164", "step": 3130, "epoch": 3 }, { "type": "loss", "content": 0.40828442573547363, "timestamp": "2025-09-05 09:13:10.022062", "step": 3131, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:13:10.218948", "step": 3131, "epoch": 3 }, { "type": "loss", "content": 0.3171522915363312, "timestamp": "2025-09-05 09:13:10.232774", "step": 3132, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:13:10.419755", "step": 3132, "epoch": 3 }, { "type": "loss", "content": 0.32482245564460754, "timestamp": "2025-09-05 09:13:10.421934", "step": 3133, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:13:10.615055", "step": 3133, "epoch": 3 }, { "type": "loss", "content": 0.3538453280925751, "timestamp": "2025-09-05 09:13:10.617063", "step": 3134, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:13:10.815295", "step": 3134, "epoch": 3 }, { "type": "loss", "content": 0.24779179692268372, "timestamp": "2025-09-05 09:13:10.817327", "step": 3135, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:13:11.022062", "step": 3135, "epoch": 3 }, { "type": "loss", "content": 0.32560235261917114, "timestamp": "2025-09-05 09:13:11.035980", "step": 3136, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:13:11.226163", "step": 3136, "epoch": 3 }, { "type": "loss", "content": 0.21415184438228607, "timestamp": "2025-09-05 09:13:11.228008", "step": 3137, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 4800029206464.0 }, "timestamp": "2025-09-05 09:13:11.424295", "step": 3137, "epoch": 3 }, { "type": "loss", "content": 0.4218257665634155, "timestamp": "2025-09-05 09:13:11.426517", "step": 3138, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:13:11.622235", "step": 3138, "epoch": 3 }, { "type": "loss", "content": 0.23505598306655884, "timestamp": "2025-09-05 09:13:11.624260", "step": 3139, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:13:11.826337", "step": 3139, "epoch": 3 }, { "type": "loss", "content": 0.23702280223369598, "timestamp": "2025-09-05 09:13:11.842648", "step": 3140, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 09:13:16.493846", "step": 3140, "epoch": 3 }, { "type": "pplx", "content": 55.51095658927002, "timestamp": "2025-09-05 09:13:16.496136", "step": 3140, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:13:16.640097", "step": 3140, "epoch": 3 }, { "type": "loss", "content": 0.3143397867679596, "timestamp": "2025-09-05 09:13:16.642303", "step": 3141, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:13:16.807476", "step": 3141, "epoch": 3 }, { "type": "loss", "content": 0.3324390649795532, "timestamp": "2025-09-05 09:13:16.809313", "step": 3142, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:13:16.974136", "step": 3142, "epoch": 3 }, { "type": "loss", "content": 0.33971095085144043, "timestamp": "2025-09-05 09:13:16.975975", "step": 3143, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:13:17.141360", "step": 3143, "epoch": 3 }, { "type": "loss", "content": 0.19881880283355713, "timestamp": "2025-09-05 09:13:17.159763", "step": 3144, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:13:17.353448", "step": 3144, "epoch": 3 }, { "type": "loss", "content": 0.38393595814704895, "timestamp": "2025-09-05 09:13:17.356113", "step": 3145, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:13:17.534073", "step": 3145, "epoch": 3 }, { "type": "loss", "content": 0.2562781870365143, "timestamp": "2025-09-05 09:13:17.535976", "step": 3146, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:13:17.721573", "step": 3146, "epoch": 3 }, { "type": "loss", "content": 0.29539769887924194, "timestamp": "2025-09-05 09:13:17.724247", "step": 3147, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 4480027263872.0 }, "timestamp": "2025-09-05 09:13:17.893316", "step": 3147, "epoch": 3 }, { "type": "loss", "content": 0.3817562162876129, "timestamp": "2025-09-05 09:13:17.907641", "step": 3148, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:13:18.072599", "step": 3148, "epoch": 3 }, { "type": "loss", "content": 0.3453420102596283, "timestamp": "2025-09-05 09:13:18.074806", "step": 3149, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:13:18.210735", "step": 3149, "epoch": 3 }, { "type": "loss", "content": 0.4222257137298584, "timestamp": "2025-09-05 09:13:18.212520", "step": 3150, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:13:18.347726", "step": 3150, "epoch": 3 }, { "type": "loss", "content": 0.3828117847442627, "timestamp": "2025-09-05 09:13:18.349613", "step": 3151, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:13:18.522127", "step": 3151, "epoch": 3 }, { "type": "loss", "content": 0.17302794754505157, "timestamp": "2025-09-05 09:13:18.531119", "step": 3152, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:13:18.663926", "step": 3152, "epoch": 3 }, { "type": "loss", "content": 0.18933707475662231, "timestamp": "2025-09-05 09:13:18.665871", "step": 3153, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-05 09:13:18.799778", "step": 3153, "epoch": 3 }, { "type": "loss", "content": 0.15503232181072235, "timestamp": "2025-09-05 09:13:18.801802", "step": 3154, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:13:18.982215", "step": 3154, "epoch": 3 }, { "type": "loss", "content": 0.23069196939468384, "timestamp": "2025-09-05 09:13:18.984252", "step": 3155, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:13:19.148184", "step": 3155, "epoch": 3 }, { "type": "loss", "content": 0.2175375521183014, "timestamp": "2025-09-05 09:13:19.162405", "step": 3156, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:13:19.327253", "step": 3156, "epoch": 3 }, { "type": "loss", "content": 0.34681564569473267, "timestamp": "2025-09-05 09:13:19.329228", "step": 3157, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:13:19.465782", "step": 3157, "epoch": 3 }, { "type": "loss", "content": 0.21456918120384216, "timestamp": "2025-09-05 09:13:19.468315", "step": 3158, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 4480027263872.0 }, "timestamp": "2025-09-05 09:13:19.632293", "step": 3158, "epoch": 3 }, { "type": "loss", "content": 0.2504569888114929, "timestamp": "2025-09-05 09:13:19.634342", "step": 3159, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:13:19.797236", "step": 3159, "epoch": 3 }, { "type": "loss", "content": 0.2558198571205139, "timestamp": "2025-09-05 09:13:19.811715", "step": 3160, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 09:13:24.479387", "step": 3160, "epoch": 3 }, { "type": "pplx", "content": 55.59432534042321, "timestamp": "2025-09-05 09:13:24.481354", "step": 3160, "epoch": 3 }, { "type": "info", "content": "Checkpoint saved at step 3160", "timestamp": "2025-09-05 09:13:24.938644", "step": 3160, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:13:25.078249", "step": 3160, "epoch": 3 }, { "type": "loss", "content": 0.22591379284858704, "timestamp": "2025-09-05 09:13:25.080397", "step": 3161, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:13:25.235416", "step": 3161, "epoch": 3 }, { "type": "loss", "content": 0.3258766233921051, "timestamp": "2025-09-05 09:13:25.237445", "step": 3162, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:13:25.393153", "step": 3162, "epoch": 3 }, { "type": "loss", "content": 0.20219656825065613, "timestamp": "2025-09-05 09:13:25.395273", "step": 3163, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:13:25.565963", "step": 3163, "epoch": 3 }, { "type": "loss", "content": 0.28959089517593384, "timestamp": "2025-09-05 09:13:25.580379", "step": 3164, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:13:25.734032", "step": 3164, "epoch": 3 }, { "type": "loss", "content": 0.4348662495613098, "timestamp": "2025-09-05 09:13:25.736065", "step": 3165, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:13:25.893585", "step": 3165, "epoch": 3 }, { "type": "loss", "content": 0.28287631273269653, "timestamp": "2025-09-05 09:13:25.895614", "step": 3166, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:13:26.053273", "step": 3166, "epoch": 3 }, { "type": "loss", "content": 0.24401001632213593, "timestamp": "2025-09-05 09:13:26.055349", "step": 3167, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:13:26.215424", "step": 3167, "epoch": 3 }, { "type": "loss", "content": 0.20977631211280823, "timestamp": "2025-09-05 09:13:26.229460", "step": 3168, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:13:26.381101", "step": 3168, "epoch": 3 }, { "type": "loss", "content": 0.2639927268028259, "timestamp": "2025-09-05 09:13:26.383275", "step": 3169, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:13:26.540967", "step": 3169, "epoch": 3 }, { "type": "loss", "content": 0.22925163805484772, "timestamp": "2025-09-05 09:13:26.543319", "step": 3170, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:13:26.700237", "step": 3170, "epoch": 3 }, { "type": "loss", "content": 0.2756401300430298, "timestamp": "2025-09-05 09:13:26.702363", "step": 3171, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:13:26.871120", "step": 3171, "epoch": 3 }, { "type": "loss", "content": 0.2692306935787201, "timestamp": "2025-09-05 09:13:26.885314", "step": 3172, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:13:27.045316", "step": 3172, "epoch": 3 }, { "type": "loss", "content": 0.29711222648620605, "timestamp": "2025-09-05 09:13:27.047747", "step": 3173, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:13:27.216850", "step": 3173, "epoch": 3 }, { "type": "loss", "content": 0.23208114504814148, "timestamp": "2025-09-05 09:13:27.218956", "step": 3174, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:13:27.377079", "step": 3174, "epoch": 3 }, { "type": "loss", "content": 0.38599449396133423, "timestamp": "2025-09-05 09:13:27.379006", "step": 3175, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:13:27.515279", "step": 3175, "epoch": 3 }, { "type": "loss", "content": 0.39971405267715454, "timestamp": "2025-09-05 09:13:27.531379", "step": 3176, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:13:27.691399", "step": 3176, "epoch": 3 }, { "type": "loss", "content": 0.19964629411697388, "timestamp": "2025-09-05 09:13:27.694855", "step": 3177, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:13:27.854835", "step": 3177, "epoch": 3 }, { "type": "loss", "content": 0.2849213778972626, "timestamp": "2025-09-05 09:13:27.857053", "step": 3178, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:13:28.014115", "step": 3178, "epoch": 3 }, { "type": "loss", "content": 0.3031027317047119, "timestamp": "2025-09-05 09:13:28.016142", "step": 3179, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:13:28.174919", "step": 3179, "epoch": 3 }, { "type": "loss", "content": 0.28292617201805115, "timestamp": "2025-09-05 09:13:28.188940", "step": 3180, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 09:13:32.818971", "step": 3180, "epoch": 3 }, { "type": "pplx", "content": 55.892830971609754, "timestamp": "2025-09-05 09:13:32.820927", "step": 3180, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:13:32.988141", "step": 3180, "epoch": 3 }, { "type": "loss", "content": 0.3160610496997833, "timestamp": "2025-09-05 09:13:32.989996", "step": 3181, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:13:33.192947", "step": 3181, "epoch": 3 }, { "type": "loss", "content": 0.39101895689964294, "timestamp": "2025-09-05 09:13:33.194897", "step": 3182, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:13:33.391519", "step": 3182, "epoch": 3 }, { "type": "loss", "content": 0.31756341457366943, "timestamp": "2025-09-05 09:13:33.393262", "step": 3183, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:13:33.598904", "step": 3183, "epoch": 3 }, { "type": "loss", "content": 0.38891616463661194, "timestamp": "2025-09-05 09:13:33.612305", "step": 3184, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:13:33.808124", "step": 3184, "epoch": 3 }, { "type": "loss", "content": 0.23683956265449524, "timestamp": "2025-09-05 09:13:33.810090", "step": 3185, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:13:34.005063", "step": 3185, "epoch": 3 }, { "type": "loss", "content": 0.2844909727573395, "timestamp": "2025-09-05 09:13:34.007312", "step": 3186, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:13:34.202313", "step": 3186, "epoch": 3 }, { "type": "loss", "content": 0.3205225467681885, "timestamp": "2025-09-05 09:13:34.204137", "step": 3187, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:13:34.406967", "step": 3187, "epoch": 3 }, { "type": "loss", "content": 0.2209499329328537, "timestamp": "2025-09-05 09:13:34.422971", "step": 3188, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:13:34.619317", "step": 3188, "epoch": 3 }, { "type": "loss", "content": 0.2793022096157074, "timestamp": "2025-09-05 09:13:34.621318", "step": 3189, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:13:34.817222", "step": 3189, "epoch": 3 }, { "type": "loss", "content": 0.2650418281555176, "timestamp": "2025-09-05 09:13:34.819152", "step": 3190, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:13:35.014115", "step": 3190, "epoch": 3 }, { "type": "loss", "content": 0.3632805347442627, "timestamp": "2025-09-05 09:13:35.016034", "step": 3191, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:13:35.210886", "step": 3191, "epoch": 3 }, { "type": "loss", "content": 0.1768227517604828, "timestamp": "2025-09-05 09:13:35.224422", "step": 3192, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:13:35.412596", "step": 3192, "epoch": 3 }, { "type": "loss", "content": 0.3283153176307678, "timestamp": "2025-09-05 09:13:35.414386", "step": 3193, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:13:35.618536", "step": 3193, "epoch": 3 }, { "type": "loss", "content": 0.2565184235572815, "timestamp": "2025-09-05 09:13:35.620431", "step": 3194, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:13:35.816892", "step": 3194, "epoch": 3 }, { "type": "loss", "content": 0.21749939024448395, "timestamp": "2025-09-05 09:13:35.819231", "step": 3195, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:13:36.022908", "step": 3195, "epoch": 3 }, { "type": "loss", "content": 0.20354753732681274, "timestamp": "2025-09-05 09:13:36.037137", "step": 3196, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:13:36.225881", "step": 3196, "epoch": 3 }, { "type": "loss", "content": 0.3543854355812073, "timestamp": "2025-09-05 09:13:36.227551", "step": 3197, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:13:36.422313", "step": 3197, "epoch": 3 }, { "type": "loss", "content": 0.3425697088241577, "timestamp": "2025-09-05 09:13:36.424066", "step": 3198, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:13:36.629872", "step": 3198, "epoch": 3 }, { "type": "loss", "content": 0.21106812357902527, "timestamp": "2025-09-05 09:13:36.632198", "step": 3199, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:13:36.837408", "step": 3199, "epoch": 3 }, { "type": "loss", "content": 0.23504656553268433, "timestamp": "2025-09-05 09:13:36.853502", "step": 3200, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 09:13:41.533859", "step": 3200, "epoch": 3 }, { "type": "pplx", "content": 55.82782568641178, "timestamp": "2025-09-05 09:13:41.536162", "step": 3200, "epoch": 3 }, { "type": "info", "content": "Checkpoint saved at step 3200", "timestamp": "2025-09-05 09:13:42.000609", "step": 3200, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 4480027263872.0 }, "timestamp": "2025-09-05 09:13:42.185163", "step": 3200, "epoch": 3 }, { "type": "loss", "content": 0.2856009304523468, "timestamp": "2025-09-05 09:13:42.188048", "step": 3201, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:13:42.383871", "step": 3201, "epoch": 3 }, { "type": "loss", "content": 0.21880973875522614, "timestamp": "2025-09-05 09:13:42.385968", "step": 3202, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:13:42.584596", "step": 3202, "epoch": 3 }, { "type": "loss", "content": 0.3296593725681305, "timestamp": "2025-09-05 09:13:42.586488", "step": 3203, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:13:42.783648", "step": 3203, "epoch": 3 }, { "type": "loss", "content": 0.4413667321205139, "timestamp": "2025-09-05 09:13:42.799481", "step": 3204, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-05 09:13:42.996163", "step": 3204, "epoch": 3 }, { "type": "loss", "content": 0.25287240743637085, "timestamp": "2025-09-05 09:13:42.998732", "step": 3205, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:13:43.193562", "step": 3205, "epoch": 3 }, { "type": "loss", "content": 0.1394006609916687, "timestamp": "2025-09-05 09:13:43.195855", "step": 3206, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:13:43.392977", "step": 3206, "epoch": 3 }, { "type": "loss", "content": 0.3348866403102875, "timestamp": "2025-09-05 09:13:43.395309", "step": 3207, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:13:43.589743", "step": 3207, "epoch": 3 }, { "type": "loss", "content": 0.4713085889816284, "timestamp": "2025-09-05 09:13:43.603613", "step": 3208, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:13:43.795151", "step": 3208, "epoch": 3 }, { "type": "loss", "content": 0.3775988817214966, "timestamp": "2025-09-05 09:13:43.797101", "step": 3209, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:13:43.993547", "step": 3209, "epoch": 3 }, { "type": "loss", "content": 0.3147152066230774, "timestamp": "2025-09-05 09:13:43.995458", "step": 3210, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:13:44.199575", "step": 3210, "epoch": 3 }, { "type": "loss", "content": 0.3415358364582062, "timestamp": "2025-09-05 09:13:44.201472", "step": 3211, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 4480027263872.0 }, "timestamp": "2025-09-05 09:13:44.400141", "step": 3211, "epoch": 3 }, { "type": "loss", "content": 0.19705742597579956, "timestamp": "2025-09-05 09:13:44.414020", "step": 3212, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:13:44.603265", "step": 3212, "epoch": 3 }, { "type": "loss", "content": 0.4473814368247986, "timestamp": "2025-09-05 09:13:44.605283", "step": 3213, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:13:44.800855", "step": 3213, "epoch": 3 }, { "type": "loss", "content": 0.34939709305763245, "timestamp": "2025-09-05 09:13:44.802749", "step": 3214, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:13:44.998852", "step": 3214, "epoch": 3 }, { "type": "loss", "content": 0.39002692699432373, "timestamp": "2025-09-05 09:13:45.000884", "step": 3215, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:13:45.197315", "step": 3215, "epoch": 3 }, { "type": "loss", "content": 0.20467509329319, "timestamp": "2025-09-05 09:13:45.213118", "step": 3216, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:13:45.407524", "step": 3216, "epoch": 3 }, { "type": "loss", "content": 0.3030194044113159, "timestamp": "2025-09-05 09:13:45.409435", "step": 3217, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:13:45.604678", "step": 3217, "epoch": 3 }, { "type": "loss", "content": 0.27976304292678833, "timestamp": "2025-09-05 09:13:45.606660", "step": 3218, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:13:45.801975", "step": 3218, "epoch": 3 }, { "type": "loss", "content": 0.19097599387168884, "timestamp": "2025-09-05 09:13:45.804011", "step": 3219, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:13:45.999009", "step": 3219, "epoch": 3 }, { "type": "loss", "content": 0.314406156539917, "timestamp": "2025-09-05 09:13:46.013392", "step": 3220, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 09:13:50.632439", "step": 3220, "epoch": 3 }, { "type": "pplx", "content": 56.24416855105137, "timestamp": "2025-09-05 09:13:50.634131", "step": 3220, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:13:50.794930", "step": 3220, "epoch": 3 }, { "type": "loss", "content": 0.303940087556839, "timestamp": "2025-09-05 09:13:50.798304", "step": 3221, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:13:51.000115", "step": 3221, "epoch": 3 }, { "type": "loss", "content": 0.24126599729061127, "timestamp": "2025-09-05 09:13:51.002152", "step": 3222, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:13:51.166682", "step": 3222, "epoch": 3 }, { "type": "loss", "content": 0.2524254322052002, "timestamp": "2025-09-05 09:13:51.168699", "step": 3223, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 4480027263872.0 }, "timestamp": "2025-09-05 09:13:51.366062", "step": 3223, "epoch": 3 }, { "type": "loss", "content": 0.3770129382610321, "timestamp": "2025-09-05 09:13:51.380480", "step": 3224, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:13:51.568788", "step": 3224, "epoch": 3 }, { "type": "loss", "content": 0.3300042450428009, "timestamp": "2025-09-05 09:13:51.570467", "step": 3225, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 4480027263872.0 }, "timestamp": "2025-09-05 09:13:51.774578", "step": 3225, "epoch": 3 }, { "type": "loss", "content": 0.3154258728027344, "timestamp": "2025-09-05 09:13:51.776918", "step": 3226, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:13:51.980358", "step": 3226, "epoch": 3 }, { "type": "loss", "content": 0.14694538712501526, "timestamp": "2025-09-05 09:13:51.982512", "step": 3227, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:13:52.180855", "step": 3227, "epoch": 3 }, { "type": "loss", "content": 0.23194490373134613, "timestamp": "2025-09-05 09:13:52.195702", "step": 3228, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:13:52.384044", "step": 3228, "epoch": 3 }, { "type": "loss", "content": 0.34397757053375244, "timestamp": "2025-09-05 09:13:52.386127", "step": 3229, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:13:52.582752", "step": 3229, "epoch": 3 }, { "type": "loss", "content": 0.2911645472049713, "timestamp": "2025-09-05 09:13:52.584730", "step": 3230, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:13:52.779959", "step": 3230, "epoch": 3 }, { "type": "loss", "content": 0.4919881224632263, "timestamp": "2025-09-05 09:13:52.782040", "step": 3231, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:13:52.976859", "step": 3231, "epoch": 3 }, { "type": "loss", "content": 0.2355922907590866, "timestamp": "2025-09-05 09:13:52.992677", "step": 3232, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:13:53.187410", "step": 3232, "epoch": 3 }, { "type": "loss", "content": 0.36438658833503723, "timestamp": "2025-09-05 09:13:53.189659", "step": 3233, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:13:53.395388", "step": 3233, "epoch": 3 }, { "type": "loss", "content": 0.18304933607578278, "timestamp": "2025-09-05 09:13:53.398335", "step": 3234, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:13:53.597527", "step": 3234, "epoch": 3 }, { "type": "loss", "content": 0.27435052394866943, "timestamp": "2025-09-05 09:13:53.599413", "step": 3235, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:13:53.804343", "step": 3235, "epoch": 3 }, { "type": "loss", "content": 0.3234035074710846, "timestamp": "2025-09-05 09:13:53.818643", "step": 3236, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:13:54.009349", "step": 3236, "epoch": 3 }, { "type": "loss", "content": 0.44214802980422974, "timestamp": "2025-09-05 09:13:54.011547", "step": 3237, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:13:54.215298", "step": 3237, "epoch": 3 }, { "type": "loss", "content": 0.19979646801948547, "timestamp": "2025-09-05 09:13:54.217753", "step": 3238, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:13:54.422847", "step": 3238, "epoch": 3 }, { "type": "loss", "content": 0.24443750083446503, "timestamp": "2025-09-05 09:13:54.424945", "step": 3239, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:13:54.628788", "step": 3239, "epoch": 3 }, { "type": "loss", "content": 0.26840123534202576, "timestamp": "2025-09-05 09:13:54.641842", "step": 3240, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 09:13:59.277876", "step": 3240, "epoch": 3 }, { "type": "pplx", "content": 56.90161940514098, "timestamp": "2025-09-05 09:13:59.280406", "step": 3240, "epoch": 3 }, { "type": "info", "content": "Checkpoint saved at step 3240", "timestamp": "2025-09-05 09:13:59.748306", "step": 3240, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:13:59.918039", "step": 3240, "epoch": 3 }, { "type": "loss", "content": 0.4399571120738983, "timestamp": "2025-09-05 09:13:59.920498", "step": 3241, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:14:00.124459", "step": 3241, "epoch": 3 }, { "type": "loss", "content": 0.26113492250442505, "timestamp": "2025-09-05 09:14:00.126551", "step": 3242, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:14:00.322918", "step": 3242, "epoch": 3 }, { "type": "loss", "content": 0.255985289812088, "timestamp": "2025-09-05 09:14:00.326082", "step": 3243, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:14:00.530352", "step": 3243, "epoch": 3 }, { "type": "loss", "content": 0.34716635942459106, "timestamp": "2025-09-05 09:14:00.544313", "step": 3244, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:14:00.732575", "step": 3244, "epoch": 3 }, { "type": "loss", "content": 0.24577327072620392, "timestamp": "2025-09-05 09:14:00.735048", "step": 3245, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:14:00.929746", "step": 3245, "epoch": 3 }, { "type": "loss", "content": 0.3293505907058716, "timestamp": "2025-09-05 09:14:00.931782", "step": 3246, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:14:01.127220", "step": 3246, "epoch": 3 }, { "type": "loss", "content": 0.3006509840488434, "timestamp": "2025-09-05 09:14:01.129383", "step": 3247, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:14:01.321083", "step": 3247, "epoch": 3 }, { "type": "loss", "content": 0.29109349846839905, "timestamp": "2025-09-05 09:14:01.334922", "step": 3248, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:14:01.531797", "step": 3248, "epoch": 3 }, { "type": "loss", "content": 0.19694913923740387, "timestamp": "2025-09-05 09:14:01.534299", "step": 3249, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:14:01.733757", "step": 3249, "epoch": 3 }, { "type": "loss", "content": 0.3834435045719147, "timestamp": "2025-09-05 09:14:01.735505", "step": 3250, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:14:01.938130", "step": 3250, "epoch": 3 }, { "type": "loss", "content": 0.23048020899295807, "timestamp": "2025-09-05 09:14:01.940204", "step": 3251, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:14:02.135458", "step": 3251, "epoch": 3 }, { "type": "loss", "content": 0.3646623492240906, "timestamp": "2025-09-05 09:14:02.150277", "step": 3252, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 4480027263872.0 }, "timestamp": "2025-09-05 09:14:02.339019", "step": 3252, "epoch": 3 }, { "type": "loss", "content": 0.3779314160346985, "timestamp": "2025-09-05 09:14:02.341680", "step": 3253, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:14:02.537969", "step": 3253, "epoch": 3 }, { "type": "loss", "content": 0.22319437563419342, "timestamp": "2025-09-05 09:14:02.540092", "step": 3254, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:14:02.737611", "step": 3254, "epoch": 3 }, { "type": "loss", "content": 0.3502451479434967, "timestamp": "2025-09-05 09:14:02.739738", "step": 3255, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:14:02.934475", "step": 3255, "epoch": 3 }, { "type": "loss", "content": 0.3203504681587219, "timestamp": "2025-09-05 09:14:02.950514", "step": 3256, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:14:03.146960", "step": 3256, "epoch": 3 }, { "type": "loss", "content": 0.2472524791955948, "timestamp": "2025-09-05 09:14:03.149188", "step": 3257, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:14:03.344244", "step": 3257, "epoch": 3 }, { "type": "loss", "content": 0.2531106770038605, "timestamp": "2025-09-05 09:14:03.346032", "step": 3258, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:14:03.552245", "step": 3258, "epoch": 3 }, { "type": "loss", "content": 0.4314362108707428, "timestamp": "2025-09-05 09:14:03.554754", "step": 3259, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:14:03.752049", "step": 3259, "epoch": 3 }, { "type": "loss", "content": 0.2161693572998047, "timestamp": "2025-09-05 09:14:03.765613", "step": 3260, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 09:14:08.486975", "step": 3260, "epoch": 3 }, { "type": "pplx", "content": 56.99661515636932, "timestamp": "2025-09-05 09:14:08.493687", "step": 3260, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:14:08.628509", "step": 3260, "epoch": 3 }, { "type": "loss", "content": 0.23275181651115417, "timestamp": "2025-09-05 09:14:08.630855", "step": 3261, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:14:08.793165", "step": 3261, "epoch": 3 }, { "type": "loss", "content": 0.4570353329181671, "timestamp": "2025-09-05 09:14:08.797479", "step": 3262, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:14:08.967300", "step": 3262, "epoch": 3 }, { "type": "loss", "content": 0.2490944266319275, "timestamp": "2025-09-05 09:14:08.976549", "step": 3263, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:14:09.140541", "step": 3263, "epoch": 3 }, { "type": "loss", "content": 0.28627151250839233, "timestamp": "2025-09-05 09:14:09.160239", "step": 3264, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-05 09:14:09.335688", "step": 3264, "epoch": 3 }, { "type": "loss", "content": 0.3446066677570343, "timestamp": "2025-09-05 09:14:09.338447", "step": 3265, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:14:09.519039", "step": 3265, "epoch": 3 }, { "type": "loss", "content": 0.2337915450334549, "timestamp": "2025-09-05 09:14:09.528948", "step": 3266, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:14:09.752462", "step": 3266, "epoch": 3 }, { "type": "loss", "content": 0.2857224941253662, "timestamp": "2025-09-05 09:14:09.756325", "step": 3267, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:14:09.992777", "step": 3267, "epoch": 3 }, { "type": "loss", "content": 0.306959867477417, "timestamp": "2025-09-05 09:14:10.021273", "step": 3268, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:14:10.249649", "step": 3268, "epoch": 3 }, { "type": "loss", "content": 0.2844340205192566, "timestamp": "2025-09-05 09:14:10.252493", "step": 3269, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:14:10.449235", "step": 3269, "epoch": 3 }, { "type": "loss", "content": 0.23272141814231873, "timestamp": "2025-09-05 09:14:10.452990", "step": 3270, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:14:10.647682", "step": 3270, "epoch": 3 }, { "type": "loss", "content": 0.2352897822856903, "timestamp": "2025-09-05 09:14:10.650540", "step": 3271, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:14:10.842725", "step": 3271, "epoch": 3 }, { "type": "loss", "content": 0.23539894819259644, "timestamp": "2025-09-05 09:14:10.855848", "step": 3272, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:14:11.043914", "step": 3272, "epoch": 3 }, { "type": "loss", "content": 0.27581384778022766, "timestamp": "2025-09-05 09:14:11.049234", "step": 3273, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:14:11.253580", "step": 3273, "epoch": 3 }, { "type": "loss", "content": 0.28692498803138733, "timestamp": "2025-09-05 09:14:11.255575", "step": 3274, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:14:11.459964", "step": 3274, "epoch": 3 }, { "type": "loss", "content": 0.2923082411289215, "timestamp": "2025-09-05 09:14:11.464675", "step": 3275, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:14:11.663448", "step": 3275, "epoch": 3 }, { "type": "loss", "content": 0.19943730533123016, "timestamp": "2025-09-05 09:14:11.678177", "step": 3276, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:14:11.867747", "step": 3276, "epoch": 3 }, { "type": "loss", "content": 0.33723267912864685, "timestamp": "2025-09-05 09:14:11.869804", "step": 3277, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:14:12.066298", "step": 3277, "epoch": 3 }, { "type": "loss", "content": 0.28404057025909424, "timestamp": "2025-09-05 09:14:12.068253", "step": 3278, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:14:12.270783", "step": 3278, "epoch": 3 }, { "type": "loss", "content": 0.4263806939125061, "timestamp": "2025-09-05 09:14:12.273001", "step": 3279, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-05 09:14:12.466169", "step": 3279, "epoch": 3 }, { "type": "loss", "content": 0.26628220081329346, "timestamp": "2025-09-05 09:14:12.480160", "step": 3280, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 09:14:17.120270", "step": 3280, "epoch": 3 }, { "type": "pplx", "content": 57.134940070215634, "timestamp": "2025-09-05 09:14:17.122321", "step": 3280, "epoch": 3 }, { "type": "info", "content": "Checkpoint saved at step 3280", "timestamp": "2025-09-05 09:14:17.606762", "step": 3280, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:14:17.775913", "step": 3280, "epoch": 3 }, { "type": "loss", "content": 0.25822189450263977, "timestamp": "2025-09-05 09:14:17.778034", "step": 3281, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:14:17.972040", "step": 3281, "epoch": 3 }, { "type": "loss", "content": 0.3605766296386719, "timestamp": "2025-09-05 09:14:17.974036", "step": 3282, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:14:18.168615", "step": 3282, "epoch": 3 }, { "type": "loss", "content": 0.16432644426822662, "timestamp": "2025-09-05 09:14:18.170672", "step": 3283, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:14:18.337349", "step": 3283, "epoch": 3 }, { "type": "loss", "content": 0.36961498856544495, "timestamp": "2025-09-05 09:14:18.353700", "step": 3284, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 4800029206464.0 }, "timestamp": "2025-09-05 09:14:18.550693", "step": 3284, "epoch": 3 }, { "type": "loss", "content": 0.314778596162796, "timestamp": "2025-09-05 09:14:18.552728", "step": 3285, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:14:18.747988", "step": 3285, "epoch": 3 }, { "type": "loss", "content": 0.2686954736709595, "timestamp": "2025-09-05 09:14:18.750449", "step": 3286, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-05 09:14:18.914285", "step": 3286, "epoch": 3 }, { "type": "loss", "content": 0.21251261234283447, "timestamp": "2025-09-05 09:14:18.916301", "step": 3287, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:14:19.116973", "step": 3287, "epoch": 3 }, { "type": "loss", "content": 0.16383251547813416, "timestamp": "2025-09-05 09:14:19.126361", "step": 3288, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:14:19.286046", "step": 3288, "epoch": 3 }, { "type": "loss", "content": 0.2723452150821686, "timestamp": "2025-09-05 09:14:19.288450", "step": 3289, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:14:19.494176", "step": 3289, "epoch": 3 }, { "type": "loss", "content": 0.19908295571804047, "timestamp": "2025-09-05 09:14:19.495960", "step": 3290, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:14:19.699764", "step": 3290, "epoch": 3 }, { "type": "loss", "content": 0.3640027940273285, "timestamp": "2025-09-05 09:14:19.701833", "step": 3291, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:14:19.905047", "step": 3291, "epoch": 3 }, { "type": "loss", "content": 0.29716038703918457, "timestamp": "2025-09-05 09:14:19.921246", "step": 3292, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:14:20.117897", "step": 3292, "epoch": 3 }, { "type": "loss", "content": 0.23596565425395966, "timestamp": "2025-09-05 09:14:20.122259", "step": 3293, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 4800029206464.0 }, "timestamp": "2025-09-05 09:14:20.328057", "step": 3293, "epoch": 3 }, { "type": "loss", "content": 0.3065091371536255, "timestamp": "2025-09-05 09:14:20.330125", "step": 3294, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:14:20.525257", "step": 3294, "epoch": 3 }, { "type": "loss", "content": 0.29849687218666077, "timestamp": "2025-09-05 09:14:20.528075", "step": 3295, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:14:20.734713", "step": 3295, "epoch": 3 }, { "type": "loss", "content": 0.2795962393283844, "timestamp": "2025-09-05 09:14:20.748329", "step": 3296, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:14:20.936480", "step": 3296, "epoch": 3 }, { "type": "loss", "content": 0.32312214374542236, "timestamp": "2025-09-05 09:14:20.938447", "step": 3297, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:14:21.134729", "step": 3297, "epoch": 3 }, { "type": "loss", "content": 0.310803085565567, "timestamp": "2025-09-05 09:14:21.136747", "step": 3298, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:14:21.332732", "step": 3298, "epoch": 3 }, { "type": "loss", "content": 0.26474887132644653, "timestamp": "2025-09-05 09:14:21.335441", "step": 3299, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:14:21.501888", "step": 3299, "epoch": 3 }, { "type": "loss", "content": 0.25056520104408264, "timestamp": "2025-09-05 09:14:21.518681", "step": 3300, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 09:14:26.147488", "step": 3300, "epoch": 3 }, { "type": "pplx", "content": 57.88412892104688, "timestamp": "2025-09-05 09:14:26.149332", "step": 3300, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:14:26.309641", "step": 3300, "epoch": 3 }, { "type": "loss", "content": 0.24012216925621033, "timestamp": "2025-09-05 09:14:26.311702", "step": 3301, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:14:26.477497", "step": 3301, "epoch": 3 }, { "type": "loss", "content": 0.3001227378845215, "timestamp": "2025-09-05 09:14:26.479473", "step": 3302, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:14:26.684596", "step": 3302, "epoch": 3 }, { "type": "loss", "content": 0.39674386382102966, "timestamp": "2025-09-05 09:14:26.686953", "step": 3303, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:14:26.882026", "step": 3303, "epoch": 3 }, { "type": "loss", "content": 0.22072692215442657, "timestamp": "2025-09-05 09:14:26.895795", "step": 3304, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:14:27.083043", "step": 3304, "epoch": 3 }, { "type": "loss", "content": 0.2948031723499298, "timestamp": "2025-09-05 09:14:27.084945", "step": 3305, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:14:27.279441", "step": 3305, "epoch": 3 }, { "type": "loss", "content": 0.28666290640830994, "timestamp": "2025-09-05 09:14:27.281505", "step": 3306, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:14:27.476574", "step": 3306, "epoch": 3 }, { "type": "loss", "content": 0.3318125903606415, "timestamp": "2025-09-05 09:14:27.478516", "step": 3307, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:14:27.673538", "step": 3307, "epoch": 3 }, { "type": "loss", "content": 0.20074622333049774, "timestamp": "2025-09-05 09:14:27.687577", "step": 3308, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:14:27.875149", "step": 3308, "epoch": 3 }, { "type": "loss", "content": 0.34110990166664124, "timestamp": "2025-09-05 09:14:27.877311", "step": 3309, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:14:28.073214", "step": 3309, "epoch": 3 }, { "type": "loss", "content": 0.31203046441078186, "timestamp": "2025-09-05 09:14:28.075308", "step": 3310, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 4480027263872.0 }, "timestamp": "2025-09-05 09:14:28.270514", "step": 3310, "epoch": 3 }, { "type": "loss", "content": 0.3481146991252899, "timestamp": "2025-09-05 09:14:28.272785", "step": 3311, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:14:28.467367", "step": 3311, "epoch": 3 }, { "type": "loss", "content": 0.2804660201072693, "timestamp": "2025-09-05 09:14:28.481098", "step": 3312, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:14:28.669353", "step": 3312, "epoch": 3 }, { "type": "loss", "content": 0.30905982851982117, "timestamp": "2025-09-05 09:14:28.671337", "step": 3313, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:14:28.875523", "step": 3313, "epoch": 3 }, { "type": "loss", "content": 0.3836857080459595, "timestamp": "2025-09-05 09:14:28.877566", "step": 3314, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:14:29.072635", "step": 3314, "epoch": 3 }, { "type": "loss", "content": 0.20405101776123047, "timestamp": "2025-09-05 09:14:29.075571", "step": 3315, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:14:29.271910", "step": 3315, "epoch": 3 }, { "type": "loss", "content": 0.2559118866920471, "timestamp": "2025-09-05 09:14:29.285986", "step": 3316, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:14:29.473237", "step": 3316, "epoch": 3 }, { "type": "loss", "content": 0.24376524984836578, "timestamp": "2025-09-05 09:14:29.474965", "step": 3317, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:14:29.679559", "step": 3317, "epoch": 3 }, { "type": "loss", "content": 0.36361056566238403, "timestamp": "2025-09-05 09:14:29.681590", "step": 3318, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:14:29.877224", "step": 3318, "epoch": 3 }, { "type": "loss", "content": 0.21259449422359467, "timestamp": "2025-09-05 09:14:29.879560", "step": 3319, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:14:30.084812", "step": 3319, "epoch": 3 }, { "type": "loss", "content": 0.13015785813331604, "timestamp": "2025-09-05 09:14:30.097888", "step": 3320, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 09:14:34.729703", "step": 3320, "epoch": 3 }, { "type": "pplx", "content": 58.29189660131222, "timestamp": "2025-09-05 09:14:34.731755", "step": 3320, "epoch": 3 }, { "type": "info", "content": "Checkpoint saved at step 3320", "timestamp": "2025-09-05 09:14:35.184580", "step": 3320, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:14:35.352584", "step": 3320, "epoch": 3 }, { "type": "loss", "content": 0.24831537902355194, "timestamp": "2025-09-05 09:14:35.354845", "step": 3321, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:14:35.521604", "step": 3321, "epoch": 3 }, { "type": "loss", "content": 0.3621161878108978, "timestamp": "2025-09-05 09:14:35.523647", "step": 3322, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:14:35.719190", "step": 3322, "epoch": 3 }, { "type": "loss", "content": 0.39665305614471436, "timestamp": "2025-09-05 09:14:35.720906", "step": 3323, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:14:35.914640", "step": 3323, "epoch": 3 }, { "type": "loss", "content": 0.2833714187145233, "timestamp": "2025-09-05 09:14:35.927820", "step": 3324, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:14:36.115552", "step": 3324, "epoch": 3 }, { "type": "loss", "content": 0.24043862521648407, "timestamp": "2025-09-05 09:14:36.117529", "step": 3325, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:14:36.322088", "step": 3325, "epoch": 3 }, { "type": "loss", "content": 0.30250465869903564, "timestamp": "2025-09-05 09:14:36.324460", "step": 3326, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:14:36.521003", "step": 3326, "epoch": 3 }, { "type": "loss", "content": 0.2354232370853424, "timestamp": "2025-09-05 09:14:36.523015", "step": 3327, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:14:36.716681", "step": 3327, "epoch": 3 }, { "type": "loss", "content": 0.2318793684244156, "timestamp": "2025-09-05 09:14:36.730420", "step": 3328, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:14:36.918065", "step": 3328, "epoch": 3 }, { "type": "loss", "content": 0.40657877922058105, "timestamp": "2025-09-05 09:14:36.920782", "step": 3329, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:14:37.116457", "step": 3329, "epoch": 3 }, { "type": "loss", "content": 0.33501723408699036, "timestamp": "2025-09-05 09:14:37.122192", "step": 3330, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:14:37.288149", "step": 3330, "epoch": 3 }, { "type": "loss", "content": 0.3885265290737152, "timestamp": "2025-09-05 09:14:37.290154", "step": 3331, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:14:37.484428", "step": 3331, "epoch": 3 }, { "type": "loss", "content": 0.2885796129703522, "timestamp": "2025-09-05 09:14:37.498324", "step": 3332, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:14:37.693667", "step": 3332, "epoch": 3 }, { "type": "loss", "content": 0.20705479383468628, "timestamp": "2025-09-05 09:14:37.695563", "step": 3333, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:14:37.891306", "step": 3333, "epoch": 3 }, { "type": "loss", "content": 0.35460859537124634, "timestamp": "2025-09-05 09:14:37.893113", "step": 3334, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-05 09:14:38.056266", "step": 3334, "epoch": 3 }, { "type": "loss", "content": 0.2885071039199829, "timestamp": "2025-09-05 09:14:38.058140", "step": 3335, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:14:38.262325", "step": 3335, "epoch": 3 }, { "type": "loss", "content": 0.31416335701942444, "timestamp": "2025-09-05 09:14:38.276155", "step": 3336, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:14:38.465017", "step": 3336, "epoch": 3 }, { "type": "loss", "content": 0.3768198788166046, "timestamp": "2025-09-05 09:14:38.467262", "step": 3337, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:14:38.662768", "step": 3337, "epoch": 3 }, { "type": "loss", "content": 0.20448677241802216, "timestamp": "2025-09-05 09:14:38.665904", "step": 3338, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:14:38.867273", "step": 3338, "epoch": 3 }, { "type": "loss", "content": 0.18872617185115814, "timestamp": "2025-09-05 09:14:38.869675", "step": 3339, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:14:39.064688", "step": 3339, "epoch": 3 }, { "type": "loss", "content": 0.2098010927438736, "timestamp": "2025-09-05 09:14:39.077793", "step": 3340, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 09:14:43.699963", "step": 3340, "epoch": 3 }, { "type": "pplx", "content": 57.79616724128643, "timestamp": "2025-09-05 09:14:43.702238", "step": 3340, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:14:43.862535", "step": 3340, "epoch": 3 }, { "type": "loss", "content": 0.1867901235818863, "timestamp": "2025-09-05 09:14:43.864350", "step": 3341, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:14:44.027936", "step": 3341, "epoch": 3 }, { "type": "loss", "content": 0.221108078956604, "timestamp": "2025-09-05 09:14:44.030146", "step": 3342, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:14:44.231856", "step": 3342, "epoch": 3 }, { "type": "loss", "content": 0.34810617566108704, "timestamp": "2025-09-05 09:14:44.234406", "step": 3343, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:14:44.431426", "step": 3343, "epoch": 3 }, { "type": "loss", "content": 0.2502887547016144, "timestamp": "2025-09-05 09:14:44.445148", "step": 3344, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 4480027263872.0 }, "timestamp": "2025-09-05 09:14:44.633890", "step": 3344, "epoch": 3 }, { "type": "loss", "content": 0.29671764373779297, "timestamp": "2025-09-05 09:14:44.635930", "step": 3345, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:14:44.827997", "step": 3345, "epoch": 3 }, { "type": "loss", "content": 0.37294748425483704, "timestamp": "2025-09-05 09:14:44.831072", "step": 3346, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:14:45.025566", "step": 3346, "epoch": 3 }, { "type": "loss", "content": 0.1889505237340927, "timestamp": "2025-09-05 09:14:45.028729", "step": 3347, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:14:45.221524", "step": 3347, "epoch": 3 }, { "type": "loss", "content": 0.3558424711227417, "timestamp": "2025-09-05 09:14:45.236034", "step": 3348, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:14:45.418501", "step": 3348, "epoch": 3 }, { "type": "loss", "content": 0.334084689617157, "timestamp": "2025-09-05 09:14:45.421506", "step": 3349, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:14:45.616166", "step": 3349, "epoch": 3 }, { "type": "loss", "content": 0.3016354739665985, "timestamp": "2025-09-05 09:14:45.618253", "step": 3350, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:14:45.812162", "step": 3350, "epoch": 3 }, { "type": "loss", "content": 0.32526397705078125, "timestamp": "2025-09-05 09:14:45.814521", "step": 3351, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:14:46.010569", "step": 3351, "epoch": 3 }, { "type": "loss", "content": 0.3452492952346802, "timestamp": "2025-09-05 09:14:46.025210", "step": 3352, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:14:46.214549", "step": 3352, "epoch": 3 }, { "type": "loss", "content": 0.2737857401371002, "timestamp": "2025-09-05 09:14:46.216639", "step": 3353, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:14:46.421245", "step": 3353, "epoch": 3 }, { "type": "loss", "content": 0.28672438859939575, "timestamp": "2025-09-05 09:14:46.423862", "step": 3354, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:14:46.619848", "step": 3354, "epoch": 3 }, { "type": "loss", "content": 0.2686102092266083, "timestamp": "2025-09-05 09:14:46.622311", "step": 3355, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:14:46.817721", "step": 3355, "epoch": 3 }, { "type": "loss", "content": 0.5274847745895386, "timestamp": "2025-09-05 09:14:46.831979", "step": 3356, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:14:47.020822", "step": 3356, "epoch": 3 }, { "type": "loss", "content": 0.3943261206150055, "timestamp": "2025-09-05 09:14:47.022852", "step": 3357, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:14:47.226539", "step": 3357, "epoch": 3 }, { "type": "loss", "content": 0.29574286937713623, "timestamp": "2025-09-05 09:14:47.228426", "step": 3358, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:14:47.423390", "step": 3358, "epoch": 3 }, { "type": "loss", "content": 0.22139273583889008, "timestamp": "2025-09-05 09:14:47.425194", "step": 3359, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:14:47.630502", "step": 3359, "epoch": 3 }, { "type": "loss", "content": 0.22906312346458435, "timestamp": "2025-09-05 09:14:47.643775", "step": 3360, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 09:14:52.285789", "step": 3360, "epoch": 3 }, { "type": "pplx", "content": 56.80259232058196, "timestamp": "2025-09-05 09:14:52.287807", "step": 3360, "epoch": 3 }, { "type": "info", "content": "Checkpoint saved at step 3360", "timestamp": "2025-09-05 09:14:52.754322", "step": 3360, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:14:52.922250", "step": 3360, "epoch": 3 }, { "type": "loss", "content": 0.2129780501127243, "timestamp": "2025-09-05 09:14:52.924346", "step": 3361, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:14:53.126998", "step": 3361, "epoch": 3 }, { "type": "loss", "content": 0.2788529396057129, "timestamp": "2025-09-05 09:14:53.129039", "step": 3362, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:14:53.323035", "step": 3362, "epoch": 3 }, { "type": "loss", "content": 0.1642555296421051, "timestamp": "2025-09-05 09:14:53.325772", "step": 3363, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:14:53.529352", "step": 3363, "epoch": 3 }, { "type": "loss", "content": 0.2505939304828644, "timestamp": "2025-09-05 09:14:53.544030", "step": 3364, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:14:53.733373", "step": 3364, "epoch": 3 }, { "type": "loss", "content": 0.3860357403755188, "timestamp": "2025-09-05 09:14:53.735306", "step": 3365, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:14:53.902689", "step": 3365, "epoch": 3 }, { "type": "loss", "content": 0.2575131058692932, "timestamp": "2025-09-05 09:14:53.904636", "step": 3366, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:14:54.111034", "step": 3366, "epoch": 3 }, { "type": "loss", "content": 0.2468288540840149, "timestamp": "2025-09-05 09:14:54.113207", "step": 3367, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:14:54.280129", "step": 3367, "epoch": 3 }, { "type": "loss", "content": 0.21068817377090454, "timestamp": "2025-09-05 09:14:54.296230", "step": 3368, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:14:54.490215", "step": 3368, "epoch": 3 }, { "type": "loss", "content": 0.3015161454677582, "timestamp": "2025-09-05 09:14:54.492249", "step": 3369, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:14:54.695993", "step": 3369, "epoch": 3 }, { "type": "loss", "content": 0.32386377453804016, "timestamp": "2025-09-05 09:14:54.698406", "step": 3370, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 4480027263872.0 }, "timestamp": "2025-09-05 09:14:54.905151", "step": 3370, "epoch": 3 }, { "type": "loss", "content": 0.3242306709289551, "timestamp": "2025-09-05 09:14:54.910037", "step": 3371, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:14:55.127944", "step": 3371, "epoch": 3 }, { "type": "loss", "content": 0.3697621524333954, "timestamp": "2025-09-05 09:14:55.142607", "step": 3372, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:14:55.330924", "step": 3372, "epoch": 3 }, { "type": "loss", "content": 0.2763964831829071, "timestamp": "2025-09-05 09:14:55.332840", "step": 3373, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:14:55.536582", "step": 3373, "epoch": 3 }, { "type": "loss", "content": 0.237064391374588, "timestamp": "2025-09-05 09:14:55.538791", "step": 3374, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:14:55.744391", "step": 3374, "epoch": 3 }, { "type": "loss", "content": 0.3390370309352875, "timestamp": "2025-09-05 09:14:55.746411", "step": 3375, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:14:55.941059", "step": 3375, "epoch": 3 }, { "type": "loss", "content": 0.3362955152988434, "timestamp": "2025-09-05 09:14:55.954866", "step": 3376, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:14:56.144253", "step": 3376, "epoch": 3 }, { "type": "loss", "content": 0.3102271854877472, "timestamp": "2025-09-05 09:14:56.146687", "step": 3377, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:14:56.349031", "step": 3377, "epoch": 3 }, { "type": "loss", "content": 0.2332654595375061, "timestamp": "2025-09-05 09:14:56.351094", "step": 3378, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:14:56.546133", "step": 3378, "epoch": 3 }, { "type": "loss", "content": 0.297242134809494, "timestamp": "2025-09-05 09:14:56.548284", "step": 3379, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:14:56.743311", "step": 3379, "epoch": 3 }, { "type": "loss", "content": 0.3286650478839874, "timestamp": "2025-09-05 09:14:56.756242", "step": 3380, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 09:15:01.458280", "step": 3380, "epoch": 3 }, { "type": "pplx", "content": 57.62159304092415, "timestamp": "2025-09-05 09:15:01.460429", "step": 3380, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:15:01.620078", "step": 3380, "epoch": 3 }, { "type": "loss", "content": 0.35747459530830383, "timestamp": "2025-09-05 09:15:01.622312", "step": 3381, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:15:01.788171", "step": 3381, "epoch": 3 }, { "type": "loss", "content": 0.21469105780124664, "timestamp": "2025-09-05 09:15:01.789982", "step": 3382, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:15:01.964218", "step": 3382, "epoch": 3 }, { "type": "loss", "content": 0.23298673331737518, "timestamp": "2025-09-05 09:15:01.966641", "step": 3383, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:15:02.175542", "step": 3383, "epoch": 3 }, { "type": "loss", "content": 0.20766644179821014, "timestamp": "2025-09-05 09:15:02.191896", "step": 3384, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:15:02.389069", "step": 3384, "epoch": 3 }, { "type": "loss", "content": 0.3102535009384155, "timestamp": "2025-09-05 09:15:02.391536", "step": 3385, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:15:02.588812", "step": 3385, "epoch": 3 }, { "type": "loss", "content": 0.23322685062885284, "timestamp": "2025-09-05 09:15:02.591193", "step": 3386, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:15:02.787088", "step": 3386, "epoch": 3 }, { "type": "loss", "content": 0.2468855381011963, "timestamp": "2025-09-05 09:15:02.789001", "step": 3387, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:15:02.984473", "step": 3387, "epoch": 3 }, { "type": "loss", "content": 0.34562644362449646, "timestamp": "2025-09-05 09:15:02.998435", "step": 3388, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:15:03.188465", "step": 3388, "epoch": 3 }, { "type": "loss", "content": 0.27134451270103455, "timestamp": "2025-09-05 09:15:03.190918", "step": 3389, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:15:03.395100", "step": 3389, "epoch": 3 }, { "type": "loss", "content": 0.28415605425834656, "timestamp": "2025-09-05 09:15:03.397109", "step": 3390, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:15:03.561477", "step": 3390, "epoch": 3 }, { "type": "loss", "content": 0.24175278842449188, "timestamp": "2025-09-05 09:15:03.563253", "step": 3391, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:15:03.768969", "step": 3391, "epoch": 3 }, { "type": "loss", "content": 0.2102387249469757, "timestamp": "2025-09-05 09:15:03.786745", "step": 3392, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:15:03.983685", "step": 3392, "epoch": 3 }, { "type": "loss", "content": 0.32016080617904663, "timestamp": "2025-09-05 09:15:03.986311", "step": 3393, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:15:04.181398", "step": 3393, "epoch": 3 }, { "type": "loss", "content": 0.20062977075576782, "timestamp": "2025-09-05 09:15:04.183780", "step": 3394, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:15:04.387838", "step": 3394, "epoch": 3 }, { "type": "loss", "content": 0.3682517409324646, "timestamp": "2025-09-05 09:15:04.389851", "step": 3395, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:15:04.553368", "step": 3395, "epoch": 3 }, { "type": "loss", "content": 0.341085284948349, "timestamp": "2025-09-05 09:15:04.569588", "step": 3396, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 4480027263872.0 }, "timestamp": "2025-09-05 09:15:04.765544", "step": 3396, "epoch": 3 }, { "type": "loss", "content": 0.32158076763153076, "timestamp": "2025-09-05 09:15:04.767464", "step": 3397, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:15:04.932611", "step": 3397, "epoch": 3 }, { "type": "loss", "content": 0.25980469584465027, "timestamp": "2025-09-05 09:15:04.934471", "step": 3398, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:15:05.139083", "step": 3398, "epoch": 3 }, { "type": "loss", "content": 0.2603665292263031, "timestamp": "2025-09-05 09:15:05.140946", "step": 3399, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:15:05.305222", "step": 3399, "epoch": 3 }, { "type": "loss", "content": 0.3023158311843872, "timestamp": "2025-09-05 09:15:05.324842", "step": 3400, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 09:15:10.151502", "step": 3400, "epoch": 3 }, { "type": "pplx", "content": 57.15140966805781, "timestamp": "2025-09-05 09:15:10.153894", "step": 3400, "epoch": 3 }, { "type": "info", "content": "Checkpoint saved at step 3400", "timestamp": "2025-09-05 09:15:10.623749", "step": 3400, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 4480027263872.0 }, "timestamp": "2025-09-05 09:15:10.790882", "step": 3400, "epoch": 3 }, { "type": "loss", "content": 0.2876795530319214, "timestamp": "2025-09-05 09:15:10.793809", "step": 3401, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:15:10.959766", "step": 3401, "epoch": 3 }, { "type": "loss", "content": 0.20607417821884155, "timestamp": "2025-09-05 09:15:10.962061", "step": 3402, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:15:11.168326", "step": 3402, "epoch": 3 }, { "type": "loss", "content": 0.2529613673686981, "timestamp": "2025-09-05 09:15:11.170402", "step": 3403, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:15:11.366376", "step": 3403, "epoch": 3 }, { "type": "loss", "content": 0.2774178385734558, "timestamp": "2025-09-05 09:15:11.380522", "step": 3404, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:15:11.570006", "step": 3404, "epoch": 3 }, { "type": "loss", "content": 0.23143890500068665, "timestamp": "2025-09-05 09:15:11.572036", "step": 3405, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:15:11.767214", "step": 3405, "epoch": 3 }, { "type": "loss", "content": 0.2894435226917267, "timestamp": "2025-09-05 09:15:11.769237", "step": 3406, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:15:11.934854", "step": 3406, "epoch": 3 }, { "type": "loss", "content": 0.28832849860191345, "timestamp": "2025-09-05 09:15:11.937495", "step": 3407, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:15:12.141320", "step": 3407, "epoch": 3 }, { "type": "loss", "content": 0.3540599048137665, "timestamp": "2025-09-05 09:15:12.157930", "step": 3408, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:15:12.356207", "step": 3408, "epoch": 3 }, { "type": "loss", "content": 0.4962592124938965, "timestamp": "2025-09-05 09:15:12.359073", "step": 3409, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:15:12.562077", "step": 3409, "epoch": 3 }, { "type": "loss", "content": 0.3610974848270416, "timestamp": "2025-09-05 09:15:12.564088", "step": 3410, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:15:12.760083", "step": 3410, "epoch": 3 }, { "type": "loss", "content": 0.238719180226326, "timestamp": "2025-09-05 09:15:12.762436", "step": 3411, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:15:12.958223", "step": 3411, "epoch": 3 }, { "type": "loss", "content": 0.3288671374320984, "timestamp": "2025-09-05 09:15:12.972265", "step": 3412, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:15:13.169674", "step": 3412, "epoch": 3 }, { "type": "loss", "content": 0.32551729679107666, "timestamp": "2025-09-05 09:15:13.172045", "step": 3413, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:15:13.378475", "step": 3413, "epoch": 3 }, { "type": "loss", "content": 0.24862384796142578, "timestamp": "2025-09-05 09:15:13.380797", "step": 3414, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:15:13.576666", "step": 3414, "epoch": 3 }, { "type": "loss", "content": 0.28981438279151917, "timestamp": "2025-09-05 09:15:13.578607", "step": 3415, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:15:13.783393", "step": 3415, "epoch": 3 }, { "type": "loss", "content": 0.2465643584728241, "timestamp": "2025-09-05 09:15:13.797643", "step": 3416, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:15:13.990010", "step": 3416, "epoch": 3 }, { "type": "loss", "content": 0.36424243450164795, "timestamp": "2025-09-05 09:15:13.992100", "step": 3417, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:15:14.188052", "step": 3417, "epoch": 3 }, { "type": "loss", "content": 0.2606428861618042, "timestamp": "2025-09-05 09:15:14.190080", "step": 3418, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:15:14.355441", "step": 3418, "epoch": 3 }, { "type": "loss", "content": 0.335843950510025, "timestamp": "2025-09-05 09:15:14.357952", "step": 3419, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:15:14.563346", "step": 3419, "epoch": 3 }, { "type": "loss", "content": 0.3763747215270996, "timestamp": "2025-09-05 09:15:14.577993", "step": 3420, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 09:15:19.236496", "step": 3420, "epoch": 3 }, { "type": "pplx", "content": 56.271628643405734, "timestamp": "2025-09-05 09:15:19.238464", "step": 3420, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:15:19.398491", "step": 3420, "epoch": 3 }, { "type": "loss", "content": 0.26849332451820374, "timestamp": "2025-09-05 09:15:19.400593", "step": 3421, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:15:19.565877", "step": 3421, "epoch": 3 }, { "type": "loss", "content": 0.2555030584335327, "timestamp": "2025-09-05 09:15:19.568109", "step": 3422, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:15:19.770532", "step": 3422, "epoch": 3 }, { "type": "loss", "content": 0.3213971257209778, "timestamp": "2025-09-05 09:15:19.772369", "step": 3423, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:15:19.968537", "step": 3423, "epoch": 3 }, { "type": "loss", "content": 0.2824600338935852, "timestamp": "2025-09-05 09:15:19.982086", "step": 3424, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:15:20.169982", "step": 3424, "epoch": 3 }, { "type": "loss", "content": 0.2855144739151001, "timestamp": "2025-09-05 09:15:20.172138", "step": 3425, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:15:20.366577", "step": 3425, "epoch": 3 }, { "type": "loss", "content": 0.3020437955856323, "timestamp": "2025-09-05 09:15:20.368634", "step": 3426, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:15:20.564048", "step": 3426, "epoch": 3 }, { "type": "loss", "content": 0.26419728994369507, "timestamp": "2025-09-05 09:15:20.566245", "step": 3427, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:15:20.762084", "step": 3427, "epoch": 3 }, { "type": "loss", "content": 0.3464629054069519, "timestamp": "2025-09-05 09:15:20.776139", "step": 3428, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:15:20.971258", "step": 3428, "epoch": 3 }, { "type": "loss", "content": 0.2719259560108185, "timestamp": "2025-09-05 09:15:20.973178", "step": 3429, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:15:21.177875", "step": 3429, "epoch": 3 }, { "type": "loss", "content": 0.29615768790245056, "timestamp": "2025-09-05 09:15:21.179698", "step": 3430, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:15:21.374975", "step": 3430, "epoch": 3 }, { "type": "loss", "content": 0.1992407739162445, "timestamp": "2025-09-05 09:15:21.377643", "step": 3431, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:15:21.573803", "step": 3431, "epoch": 3 }, { "type": "loss", "content": 0.23724502325057983, "timestamp": "2025-09-05 09:15:21.587778", "step": 3432, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:15:21.774715", "step": 3432, "epoch": 3 }, { "type": "loss", "content": 0.3083495795726776, "timestamp": "2025-09-05 09:15:21.776739", "step": 3433, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:15:21.980420", "step": 3433, "epoch": 3 }, { "type": "loss", "content": 0.3451620638370514, "timestamp": "2025-09-05 09:15:21.982583", "step": 3434, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:15:22.179465", "step": 3434, "epoch": 3 }, { "type": "loss", "content": 0.3891802728176117, "timestamp": "2025-09-05 09:15:22.181453", "step": 3435, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:15:22.385405", "step": 3435, "epoch": 3 }, { "type": "loss", "content": 0.3739110231399536, "timestamp": "2025-09-05 09:15:22.399825", "step": 3436, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:15:22.595466", "step": 3436, "epoch": 3 }, { "type": "loss", "content": 0.2214849293231964, "timestamp": "2025-09-05 09:15:22.600093", "step": 3437, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:15:22.767477", "step": 3437, "epoch": 3 }, { "type": "loss", "content": 0.36019426584243774, "timestamp": "2025-09-05 09:15:22.775452", "step": 3438, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:15:22.982396", "step": 3438, "epoch": 3 }, { "type": "loss", "content": 0.1959642767906189, "timestamp": "2025-09-05 09:15:22.984052", "step": 3439, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 4480027263872.0 }, "timestamp": "2025-09-05 09:15:23.180164", "step": 3439, "epoch": 3 }, { "type": "loss", "content": 0.30671948194503784, "timestamp": "2025-09-05 09:15:23.194420", "step": 3440, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 09:15:27.842056", "step": 3440, "epoch": 3 }, { "type": "pplx", "content": 56.487056186244445, "timestamp": "2025-09-05 09:15:27.844216", "step": 3440, "epoch": 3 }, { "type": "info", "content": "Checkpoint saved at step 3440", "timestamp": "2025-09-05 09:15:28.308530", "step": 3440, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:15:28.469196", "step": 3440, "epoch": 3 }, { "type": "loss", "content": 0.4308766722679138, "timestamp": "2025-09-05 09:15:28.471395", "step": 3441, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:15:28.672848", "step": 3441, "epoch": 3 }, { "type": "loss", "content": 0.2569759488105774, "timestamp": "2025-09-05 09:15:28.675252", "step": 3442, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:15:28.871485", "step": 3442, "epoch": 3 }, { "type": "loss", "content": 0.39308369159698486, "timestamp": "2025-09-05 09:15:28.873593", "step": 3443, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:15:29.067035", "step": 3443, "epoch": 3 }, { "type": "loss", "content": 0.36607518792152405, "timestamp": "2025-09-05 09:15:29.080686", "step": 3444, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:15:29.266327", "step": 3444, "epoch": 3 }, { "type": "loss", "content": 0.3157901167869568, "timestamp": "2025-09-05 09:15:29.268664", "step": 3445, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:15:29.463369", "step": 3445, "epoch": 3 }, { "type": "loss", "content": 0.32073408365249634, "timestamp": "2025-09-05 09:15:29.465382", "step": 3446, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:15:29.668771", "step": 3446, "epoch": 3 }, { "type": "loss", "content": 0.21521615982055664, "timestamp": "2025-09-05 09:15:29.670848", "step": 3447, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:15:29.873604", "step": 3447, "epoch": 3 }, { "type": "loss", "content": 0.24196557700634003, "timestamp": "2025-09-05 09:15:29.887959", "step": 3448, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:15:30.082517", "step": 3448, "epoch": 3 }, { "type": "loss", "content": 0.34385359287261963, "timestamp": "2025-09-05 09:15:30.084676", "step": 3449, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:15:30.279879", "step": 3449, "epoch": 3 }, { "type": "loss", "content": 0.2406640201807022, "timestamp": "2025-09-05 09:15:30.282237", "step": 3450, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:15:30.475842", "step": 3450, "epoch": 3 }, { "type": "loss", "content": 0.18529509007930756, "timestamp": "2025-09-05 09:15:30.477976", "step": 3451, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:15:30.672260", "step": 3451, "epoch": 3 }, { "type": "loss", "content": 0.3604893088340759, "timestamp": "2025-09-05 09:15:30.688470", "step": 3452, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:15:30.883472", "step": 3452, "epoch": 3 }, { "type": "loss", "content": 0.3433663249015808, "timestamp": "2025-09-05 09:15:30.885759", "step": 3453, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:15:31.079225", "step": 3453, "epoch": 3 }, { "type": "loss", "content": 0.2779248058795929, "timestamp": "2025-09-05 09:15:31.082324", "step": 3454, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:15:31.275564", "step": 3454, "epoch": 3 }, { "type": "loss", "content": 0.31973740458488464, "timestamp": "2025-09-05 09:15:31.277984", "step": 3455, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:15:31.471886", "step": 3455, "epoch": 3 }, { "type": "loss", "content": 0.349806547164917, "timestamp": "2025-09-05 09:15:31.486066", "step": 3456, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:15:31.676963", "step": 3456, "epoch": 3 }, { "type": "loss", "content": 0.1699691265821457, "timestamp": "2025-09-05 09:15:31.678853", "step": 3457, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:15:31.871846", "step": 3457, "epoch": 3 }, { "type": "loss", "content": 0.4177989065647125, "timestamp": "2025-09-05 09:15:31.883300", "step": 3458, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:15:32.078448", "step": 3458, "epoch": 3 }, { "type": "loss", "content": 0.24496307969093323, "timestamp": "2025-09-05 09:15:32.080765", "step": 3459, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:15:32.284260", "step": 3459, "epoch": 3 }, { "type": "loss", "content": 0.24864445626735687, "timestamp": "2025-09-05 09:15:32.298058", "step": 3460, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 09:15:36.949048", "step": 3460, "epoch": 3 }, { "type": "pplx", "content": 56.84112110557716, "timestamp": "2025-09-05 09:15:36.951078", "step": 3460, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:15:37.108299", "step": 3460, "epoch": 3 }, { "type": "loss", "content": 0.33684948086738586, "timestamp": "2025-09-05 09:15:37.111450", "step": 3461, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:15:37.276196", "step": 3461, "epoch": 3 }, { "type": "loss", "content": 0.371894896030426, "timestamp": "2025-09-05 09:15:37.278182", "step": 3462, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:15:37.481793", "step": 3462, "epoch": 3 }, { "type": "loss", "content": 0.41939064860343933, "timestamp": "2025-09-05 09:15:37.483876", "step": 3463, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:15:37.678809", "step": 3463, "epoch": 3 }, { "type": "loss", "content": 0.16646873950958252, "timestamp": "2025-09-05 09:15:37.692365", "step": 3464, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:15:37.887252", "step": 3464, "epoch": 3 }, { "type": "loss", "content": 0.27085840702056885, "timestamp": "2025-09-05 09:15:37.889497", "step": 3465, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:15:38.085222", "step": 3465, "epoch": 3 }, { "type": "loss", "content": 0.3408336639404297, "timestamp": "2025-09-05 09:15:38.087272", "step": 3466, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:15:38.288038", "step": 3466, "epoch": 3 }, { "type": "loss", "content": 0.23875372111797333, "timestamp": "2025-09-05 09:15:38.290372", "step": 3467, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:15:38.484252", "step": 3467, "epoch": 3 }, { "type": "loss", "content": 0.28153443336486816, "timestamp": "2025-09-05 09:15:38.498227", "step": 3468, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:15:38.686591", "step": 3468, "epoch": 3 }, { "type": "loss", "content": 0.3948226869106293, "timestamp": "2025-09-05 09:15:38.689339", "step": 3469, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:15:38.883108", "step": 3469, "epoch": 3 }, { "type": "loss", "content": 0.253915011882782, "timestamp": "2025-09-05 09:15:38.885040", "step": 3470, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:15:39.079939", "step": 3470, "epoch": 3 }, { "type": "loss", "content": 0.2193460613489151, "timestamp": "2025-09-05 09:15:39.081811", "step": 3471, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:15:39.276867", "step": 3471, "epoch": 3 }, { "type": "loss", "content": 0.19324208796024323, "timestamp": "2025-09-05 09:15:39.291080", "step": 3472, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:15:39.478061", "step": 3472, "epoch": 3 }, { "type": "loss", "content": 0.3039948344230652, "timestamp": "2025-09-05 09:15:39.479952", "step": 3473, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:15:39.675488", "step": 3473, "epoch": 3 }, { "type": "loss", "content": 0.36532217264175415, "timestamp": "2025-09-05 09:15:39.677479", "step": 3474, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-05 09:15:39.878087", "step": 3474, "epoch": 3 }, { "type": "loss", "content": 0.2959711253643036, "timestamp": "2025-09-05 09:15:39.879967", "step": 3475, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:15:40.075382", "step": 3475, "epoch": 3 }, { "type": "loss", "content": 0.44074317812919617, "timestamp": "2025-09-05 09:15:40.091771", "step": 3476, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:15:40.286274", "step": 3476, "epoch": 3 }, { "type": "loss", "content": 0.2943912446498871, "timestamp": "2025-09-05 09:15:40.288213", "step": 3477, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:15:40.490989", "step": 3477, "epoch": 3 }, { "type": "loss", "content": 0.21967966854572296, "timestamp": "2025-09-05 09:15:40.492987", "step": 3478, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:15:40.688577", "step": 3478, "epoch": 3 }, { "type": "loss", "content": 0.1738353669643402, "timestamp": "2025-09-05 09:15:40.691098", "step": 3479, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:15:40.894679", "step": 3479, "epoch": 3 }, { "type": "loss", "content": 0.3198263347148895, "timestamp": "2025-09-05 09:15:40.908795", "step": 3480, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 09:15:45.541745", "step": 3480, "epoch": 3 }, { "type": "pplx", "content": 57.259662380369484, "timestamp": "2025-09-05 09:15:45.543529", "step": 3480, "epoch": 3 }, { "type": "info", "content": "Checkpoint saved at step 3480", "timestamp": "2025-09-05 09:15:45.993651", "step": 3480, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 4480027263872.0 }, "timestamp": "2025-09-05 09:15:46.154598", "step": 3480, "epoch": 3 }, { "type": "loss", "content": 0.30058109760284424, "timestamp": "2025-09-05 09:15:46.156858", "step": 3481, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 4480027263872.0 }, "timestamp": "2025-09-05 09:15:46.349969", "step": 3481, "epoch": 3 }, { "type": "loss", "content": 0.325216144323349, "timestamp": "2025-09-05 09:15:46.351846", "step": 3482, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:15:46.554582", "step": 3482, "epoch": 3 }, { "type": "loss", "content": 0.2780161499977112, "timestamp": "2025-09-05 09:15:46.556875", "step": 3483, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:15:46.760064", "step": 3483, "epoch": 3 }, { "type": "loss", "content": 0.1740078330039978, "timestamp": "2025-09-05 09:15:46.768946", "step": 3484, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 4480027263872.0 }, "timestamp": "2025-09-05 09:15:46.930918", "step": 3484, "epoch": 3 }, { "type": "loss", "content": 0.3463882803916931, "timestamp": "2025-09-05 09:15:46.933011", "step": 3485, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:15:47.137903", "step": 3485, "epoch": 3 }, { "type": "loss", "content": 0.1923578530550003, "timestamp": "2025-09-05 09:15:47.140077", "step": 3486, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:15:47.334418", "step": 3486, "epoch": 3 }, { "type": "loss", "content": 0.27616503834724426, "timestamp": "2025-09-05 09:15:47.336586", "step": 3487, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:15:47.541527", "step": 3487, "epoch": 3 }, { "type": "loss", "content": 0.475725382566452, "timestamp": "2025-09-05 09:15:47.550637", "step": 3488, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:15:47.714230", "step": 3488, "epoch": 3 }, { "type": "loss", "content": 0.42054125666618347, "timestamp": "2025-09-05 09:15:47.716194", "step": 3489, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:15:47.920059", "step": 3489, "epoch": 3 }, { "type": "loss", "content": 0.2236691415309906, "timestamp": "2025-09-05 09:15:47.922360", "step": 3490, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:15:48.117564", "step": 3490, "epoch": 3 }, { "type": "loss", "content": 0.21372617781162262, "timestamp": "2025-09-05 09:15:48.119899", "step": 3491, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:15:48.282364", "step": 3491, "epoch": 3 }, { "type": "loss", "content": 0.3831466734409332, "timestamp": "2025-09-05 09:15:48.296563", "step": 3492, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:15:48.480614", "step": 3492, "epoch": 3 }, { "type": "loss", "content": 0.4005196690559387, "timestamp": "2025-09-05 09:15:48.482752", "step": 3493, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:15:48.687366", "step": 3493, "epoch": 3 }, { "type": "loss", "content": 0.3020406663417816, "timestamp": "2025-09-05 09:15:48.689928", "step": 3494, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:15:48.882552", "step": 3494, "epoch": 3 }, { "type": "loss", "content": 0.33710694313049316, "timestamp": "2025-09-05 09:15:48.884841", "step": 3495, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:15:49.077262", "step": 3495, "epoch": 3 }, { "type": "loss", "content": 0.41189056634902954, "timestamp": "2025-09-05 09:15:49.093452", "step": 3496, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:15:49.285982", "step": 3496, "epoch": 3 }, { "type": "loss", "content": 0.3529796004295349, "timestamp": "2025-09-05 09:15:49.288044", "step": 3497, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:15:49.481074", "step": 3497, "epoch": 3 }, { "type": "loss", "content": 0.41452711820602417, "timestamp": "2025-09-05 09:15:49.483836", "step": 3498, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:15:49.685250", "step": 3498, "epoch": 3 }, { "type": "loss", "content": 0.23813626170158386, "timestamp": "2025-09-05 09:15:49.688159", "step": 3499, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:15:49.890585", "step": 3499, "epoch": 3 }, { "type": "loss", "content": 0.2140645980834961, "timestamp": "2025-09-05 09:15:49.904427", "step": 3500, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 09:15:54.532826", "step": 3500, "epoch": 3 }, { "type": "pplx", "content": 57.432321365212985, "timestamp": "2025-09-05 09:15:54.535264", "step": 3500, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 4480027263872.0 }, "timestamp": "2025-09-05 09:15:54.697969", "step": 3500, "epoch": 3 }, { "type": "loss", "content": 0.3301979899406433, "timestamp": "2025-09-05 09:15:54.700718", "step": 3501, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:15:54.868212", "step": 3501, "epoch": 3 }, { "type": "loss", "content": 0.25775179266929626, "timestamp": "2025-09-05 09:15:54.870710", "step": 3502, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:15:55.074541", "step": 3502, "epoch": 3 }, { "type": "loss", "content": 0.2093990296125412, "timestamp": "2025-09-05 09:15:55.076740", "step": 3503, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:15:55.270867", "step": 3503, "epoch": 3 }, { "type": "loss", "content": 0.23625101149082184, "timestamp": "2025-09-05 09:15:55.286009", "step": 3504, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:15:55.473909", "step": 3504, "epoch": 3 }, { "type": "loss", "content": 0.3592059016227722, "timestamp": "2025-09-05 09:15:55.475922", "step": 3505, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:15:55.679420", "step": 3505, "epoch": 3 }, { "type": "loss", "content": 0.2650335431098938, "timestamp": "2025-09-05 09:15:55.681455", "step": 3506, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:15:55.885416", "step": 3506, "epoch": 3 }, { "type": "loss", "content": 0.25605595111846924, "timestamp": "2025-09-05 09:15:55.887447", "step": 3507, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:15:56.082118", "step": 3507, "epoch": 3 }, { "type": "loss", "content": 0.2315322905778885, "timestamp": "2025-09-05 09:15:56.098407", "step": 3508, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:15:56.294219", "step": 3508, "epoch": 3 }, { "type": "loss", "content": 0.3174898028373718, "timestamp": "2025-09-05 09:15:56.296213", "step": 3509, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:15:56.499496", "step": 3509, "epoch": 3 }, { "type": "loss", "content": 0.3008250296115875, "timestamp": "2025-09-05 09:15:56.501873", "step": 3510, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:15:56.704909", "step": 3510, "epoch": 3 }, { "type": "loss", "content": 0.33064156770706177, "timestamp": "2025-09-05 09:15:56.707016", "step": 3511, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 4480027263872.0 }, "timestamp": "2025-09-05 09:15:56.902443", "step": 3511, "epoch": 3 }, { "type": "loss", "content": 0.2688453793525696, "timestamp": "2025-09-05 09:15:56.911436", "step": 3512, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:15:57.074020", "step": 3512, "epoch": 3 }, { "type": "loss", "content": 0.23117870092391968, "timestamp": "2025-09-05 09:15:57.075969", "step": 3513, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:15:57.280000", "step": 3513, "epoch": 3 }, { "type": "loss", "content": 0.17968448996543884, "timestamp": "2025-09-05 09:15:57.281980", "step": 3514, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:15:57.479408", "step": 3514, "epoch": 3 }, { "type": "loss", "content": 0.25697532296180725, "timestamp": "2025-09-05 09:15:57.481853", "step": 3515, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:15:57.685913", "step": 3515, "epoch": 3 }, { "type": "loss", "content": 0.29816049337387085, "timestamp": "2025-09-05 09:15:57.698968", "step": 3516, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:15:57.885343", "step": 3516, "epoch": 3 }, { "type": "loss", "content": 0.23597969114780426, "timestamp": "2025-09-05 09:15:57.887857", "step": 3517, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:15:58.093491", "step": 3517, "epoch": 3 }, { "type": "loss", "content": 0.26065388321876526, "timestamp": "2025-09-05 09:15:58.095763", "step": 3518, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:15:58.291219", "step": 3518, "epoch": 3 }, { "type": "loss", "content": 0.24879339337348938, "timestamp": "2025-09-05 09:15:58.293254", "step": 3519, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:15:58.489325", "step": 3519, "epoch": 3 }, { "type": "loss", "content": 0.2509574294090271, "timestamp": "2025-09-05 09:15:58.502474", "step": 3520, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 09:16:03.161672", "step": 3520, "epoch": 3 }, { "type": "pplx", "content": 57.248311386178514, "timestamp": "2025-09-05 09:16:03.164871", "step": 3520, "epoch": 3 }, { "type": "info", "content": "Checkpoint saved at step 3520", "timestamp": "2025-09-05 09:16:03.773257", "step": 3520, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:16:03.938424", "step": 3520, "epoch": 3 }, { "type": "loss", "content": 0.24730339646339417, "timestamp": "2025-09-05 09:16:03.940606", "step": 3521, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:16:04.135104", "step": 3521, "epoch": 3 }, { "type": "loss", "content": 0.3603556752204895, "timestamp": "2025-09-05 09:16:04.137174", "step": 3522, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:16:04.332922", "step": 3522, "epoch": 3 }, { "type": "loss", "content": 0.27035149931907654, "timestamp": "2025-09-05 09:16:04.335173", "step": 3523, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:16:04.531971", "step": 3523, "epoch": 3 }, { "type": "loss", "content": 0.19072581827640533, "timestamp": "2025-09-05 09:16:04.546028", "step": 3524, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:16:04.732763", "step": 3524, "epoch": 3 }, { "type": "loss", "content": 0.4346585273742676, "timestamp": "2025-09-05 09:16:04.735731", "step": 3525, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:16:04.929976", "step": 3525, "epoch": 3 }, { "type": "loss", "content": 0.30274319648742676, "timestamp": "2025-09-05 09:16:04.932300", "step": 3526, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:16:05.128812", "step": 3526, "epoch": 3 }, { "type": "loss", "content": 0.2595900893211365, "timestamp": "2025-09-05 09:16:05.130772", "step": 3527, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:16:05.324734", "step": 3527, "epoch": 3 }, { "type": "loss", "content": 0.20915935933589935, "timestamp": "2025-09-05 09:16:05.340930", "step": 3528, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:16:05.536897", "step": 3528, "epoch": 3 }, { "type": "loss", "content": 0.26858463883399963, "timestamp": "2025-09-05 09:16:05.539280", "step": 3529, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:16:05.705787", "step": 3529, "epoch": 3 }, { "type": "loss", "content": 0.35047516226768494, "timestamp": "2025-09-05 09:16:05.708011", "step": 3530, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:16:05.911338", "step": 3530, "epoch": 3 }, { "type": "loss", "content": 0.2243708372116089, "timestamp": "2025-09-05 09:16:05.913729", "step": 3531, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:16:06.082348", "step": 3531, "epoch": 3 }, { "type": "loss", "content": 0.19937127828598022, "timestamp": "2025-09-05 09:16:06.098218", "step": 3532, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:16:06.294245", "step": 3532, "epoch": 3 }, { "type": "loss", "content": 0.17561663687229156, "timestamp": "2025-09-05 09:16:06.296290", "step": 3533, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:16:06.497951", "step": 3533, "epoch": 3 }, { "type": "loss", "content": 0.1659424901008606, "timestamp": "2025-09-05 09:16:06.499835", "step": 3534, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:16:06.696327", "step": 3534, "epoch": 3 }, { "type": "loss", "content": 0.28922516107559204, "timestamp": "2025-09-05 09:16:06.699244", "step": 3535, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:16:06.896587", "step": 3535, "epoch": 3 }, { "type": "loss", "content": 0.13516835868358612, "timestamp": "2025-09-05 09:16:06.911478", "step": 3536, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:16:07.110059", "step": 3536, "epoch": 3 }, { "type": "loss", "content": 0.217886820435524, "timestamp": "2025-09-05 09:16:07.112524", "step": 3537, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:16:07.308950", "step": 3537, "epoch": 3 }, { "type": "loss", "content": 0.2815990149974823, "timestamp": "2025-09-05 09:16:07.310854", "step": 3538, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:16:07.477125", "step": 3538, "epoch": 3 }, { "type": "loss", "content": 0.34248197078704834, "timestamp": "2025-09-05 09:16:07.479196", "step": 3539, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:16:07.683880", "step": 3539, "epoch": 3 }, { "type": "loss", "content": 0.35142749547958374, "timestamp": "2025-09-05 09:16:07.700213", "step": 3540, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 09:16:12.334278", "step": 3540, "epoch": 3 }, { "type": "pplx", "content": 57.18878524249908, "timestamp": "2025-09-05 09:16:12.336402", "step": 3540, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:16:12.495423", "step": 3540, "epoch": 3 }, { "type": "loss", "content": 0.2691498398780823, "timestamp": "2025-09-05 09:16:12.497432", "step": 3541, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:16:12.703100", "step": 3541, "epoch": 3 }, { "type": "loss", "content": 0.21435484290122986, "timestamp": "2025-09-05 09:16:12.705330", "step": 3542, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:16:12.901321", "step": 3542, "epoch": 3 }, { "type": "loss", "content": 0.257010281085968, "timestamp": "2025-09-05 09:16:12.903276", "step": 3543, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:16:13.107816", "step": 3543, "epoch": 3 }, { "type": "loss", "content": 0.31392380595207214, "timestamp": "2025-09-05 09:16:13.121641", "step": 3544, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:16:13.316878", "step": 3544, "epoch": 3 }, { "type": "loss", "content": 0.2716064751148224, "timestamp": "2025-09-05 09:16:13.318958", "step": 3545, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:16:13.483554", "step": 3545, "epoch": 3 }, { "type": "loss", "content": 0.33304086327552795, "timestamp": "2025-09-05 09:16:13.486119", "step": 3546, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-05 09:16:13.688428", "step": 3546, "epoch": 3 }, { "type": "loss", "content": 0.3453178405761719, "timestamp": "2025-09-05 09:16:13.690938", "step": 3547, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:16:13.890286", "step": 3547, "epoch": 3 }, { "type": "loss", "content": 0.3486884832382202, "timestamp": "2025-09-05 09:16:13.907139", "step": 3548, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 4480027263872.0 }, "timestamp": "2025-09-05 09:16:14.103328", "step": 3548, "epoch": 3 }, { "type": "loss", "content": 0.44889482855796814, "timestamp": "2025-09-05 09:16:14.106268", "step": 3549, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:16:14.302303", "step": 3549, "epoch": 3 }, { "type": "loss", "content": 0.2349972277879715, "timestamp": "2025-09-05 09:16:14.304333", "step": 3550, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:16:14.503881", "step": 3550, "epoch": 3 }, { "type": "loss", "content": 0.18555277585983276, "timestamp": "2025-09-05 09:16:14.505559", "step": 3551, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:16:14.701071", "step": 3551, "epoch": 3 }, { "type": "loss", "content": 0.34614723920822144, "timestamp": "2025-09-05 09:16:14.714520", "step": 3552, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:16:14.902023", "step": 3552, "epoch": 3 }, { "type": "loss", "content": 0.22294148802757263, "timestamp": "2025-09-05 09:16:14.903759", "step": 3553, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:16:15.099938", "step": 3553, "epoch": 3 }, { "type": "loss", "content": 0.3558078706264496, "timestamp": "2025-09-05 09:16:15.102509", "step": 3554, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:16:15.295938", "step": 3554, "epoch": 3 }, { "type": "loss", "content": 0.20311148464679718, "timestamp": "2025-09-05 09:16:15.297998", "step": 3555, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:16:15.492813", "step": 3555, "epoch": 3 }, { "type": "loss", "content": 0.2172691971063614, "timestamp": "2025-09-05 09:16:15.508345", "step": 3556, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:16:15.702178", "step": 3556, "epoch": 3 }, { "type": "loss", "content": 0.29311197996139526, "timestamp": "2025-09-05 09:16:15.704350", "step": 3557, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:16:15.907404", "step": 3557, "epoch": 3 }, { "type": "loss", "content": 0.31963905692100525, "timestamp": "2025-09-05 09:16:15.909545", "step": 3558, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:16:16.111802", "step": 3558, "epoch": 3 }, { "type": "loss", "content": 0.47033655643463135, "timestamp": "2025-09-05 09:16:16.113736", "step": 3559, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 4480027263872.0 }, "timestamp": "2025-09-05 09:16:16.307815", "step": 3559, "epoch": 3 }, { "type": "loss", "content": 0.34976720809936523, "timestamp": "2025-09-05 09:16:16.321703", "step": 3560, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 09:16:21.188059", "step": 3560, "epoch": 3 }, { "type": "pplx", "content": 57.505925013612185, "timestamp": "2025-09-05 09:16:21.190112", "step": 3560, "epoch": 3 }, { "type": "info", "content": "Checkpoint saved at step 3560", "timestamp": "2025-09-05 09:16:21.625686", "step": 3560, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:16:21.786287", "step": 3560, "epoch": 3 }, { "type": "loss", "content": 0.33070307970046997, "timestamp": "2025-09-05 09:16:21.788767", "step": 3561, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:16:21.953471", "step": 3561, "epoch": 3 }, { "type": "loss", "content": 0.2531128227710724, "timestamp": "2025-09-05 09:16:21.955557", "step": 3562, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:16:22.148695", "step": 3562, "epoch": 3 }, { "type": "loss", "content": 0.3364905118942261, "timestamp": "2025-09-05 09:16:22.150816", "step": 3563, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:16:22.321957", "step": 3563, "epoch": 3 }, { "type": "loss", "content": 0.2518615424633026, "timestamp": "2025-09-05 09:16:22.335784", "step": 3564, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:16:22.489115", "step": 3564, "epoch": 3 }, { "type": "loss", "content": 0.35287201404571533, "timestamp": "2025-09-05 09:16:22.491003", "step": 3565, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:16:22.649868", "step": 3565, "epoch": 3 }, { "type": "loss", "content": 0.31937721371650696, "timestamp": "2025-09-05 09:16:22.651695", "step": 3566, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:16:22.821112", "step": 3566, "epoch": 3 }, { "type": "loss", "content": 0.329274445772171, "timestamp": "2025-09-05 09:16:22.823418", "step": 3567, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:16:22.981071", "step": 3567, "epoch": 3 }, { "type": "loss", "content": 0.23146797716617584, "timestamp": "2025-09-05 09:16:22.994974", "step": 3568, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 4480027263872.0 }, "timestamp": "2025-09-05 09:16:23.147220", "step": 3568, "epoch": 3 }, { "type": "loss", "content": 0.3371192514896393, "timestamp": "2025-09-05 09:16:23.149264", "step": 3569, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:16:23.309883", "step": 3569, "epoch": 3 }, { "type": "loss", "content": 0.30641764402389526, "timestamp": "2025-09-05 09:16:23.312208", "step": 3570, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:16:23.470545", "step": 3570, "epoch": 3 }, { "type": "loss", "content": 0.3931610882282257, "timestamp": "2025-09-05 09:16:23.472232", "step": 3571, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:16:23.628899", "step": 3571, "epoch": 3 }, { "type": "loss", "content": 0.2743593454360962, "timestamp": "2025-09-05 09:16:23.642205", "step": 3572, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:16:23.793306", "step": 3572, "epoch": 3 }, { "type": "loss", "content": 0.19696570932865143, "timestamp": "2025-09-05 09:16:23.795320", "step": 3573, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:16:23.964282", "step": 3573, "epoch": 3 }, { "type": "loss", "content": 0.31469157338142395, "timestamp": "2025-09-05 09:16:23.966092", "step": 3574, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:16:24.124842", "step": 3574, "epoch": 3 }, { "type": "loss", "content": 0.35265278816223145, "timestamp": "2025-09-05 09:16:24.126761", "step": 3575, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:16:24.285175", "step": 3575, "epoch": 3 }, { "type": "loss", "content": 0.2801409959793091, "timestamp": "2025-09-05 09:16:24.299057", "step": 3576, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:16:24.450733", "step": 3576, "epoch": 3 }, { "type": "loss", "content": 0.26633474230766296, "timestamp": "2025-09-05 09:16:24.453513", "step": 3577, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:16:24.610965", "step": 3577, "epoch": 3 }, { "type": "loss", "content": 0.3046843707561493, "timestamp": "2025-09-05 09:16:24.612825", "step": 3578, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:16:24.769922", "step": 3578, "epoch": 3 }, { "type": "loss", "content": 0.3721226453781128, "timestamp": "2025-09-05 09:16:24.771766", "step": 3579, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:16:24.931204", "step": 3579, "epoch": 3 }, { "type": "loss", "content": 0.264265239238739, "timestamp": "2025-09-05 09:16:24.945012", "step": 3580, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 09:16:29.567482", "step": 3580, "epoch": 3 }, { "type": "pplx", "content": 57.221559294521626, "timestamp": "2025-09-05 09:16:29.569634", "step": 3580, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:16:29.702774", "step": 3580, "epoch": 3 }, { "type": "loss", "content": 0.27857106924057007, "timestamp": "2025-09-05 09:16:29.705099", "step": 3581, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:16:29.841394", "step": 3581, "epoch": 3 }, { "type": "loss", "content": 0.35566446185112, "timestamp": "2025-09-05 09:16:29.843601", "step": 3582, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:16:30.016043", "step": 3582, "epoch": 3 }, { "type": "loss", "content": 0.2681717574596405, "timestamp": "2025-09-05 09:16:30.018309", "step": 3583, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:16:30.183556", "step": 3583, "epoch": 3 }, { "type": "loss", "content": 0.29908114671707153, "timestamp": "2025-09-05 09:16:30.199639", "step": 3584, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:16:30.363380", "step": 3584, "epoch": 3 }, { "type": "loss", "content": 0.32012149691581726, "timestamp": "2025-09-05 09:16:30.365411", "step": 3585, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:16:30.534700", "step": 3585, "epoch": 3 }, { "type": "loss", "content": 0.1617680937051773, "timestamp": "2025-09-05 09:16:30.536749", "step": 3586, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:16:30.711043", "step": 3586, "epoch": 3 }, { "type": "loss", "content": 0.18636199831962585, "timestamp": "2025-09-05 09:16:30.713254", "step": 3587, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:16:30.877559", "step": 3587, "epoch": 3 }, { "type": "loss", "content": 0.26028791069984436, "timestamp": "2025-09-05 09:16:30.891716", "step": 3588, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:16:31.048008", "step": 3588, "epoch": 3 }, { "type": "loss", "content": 0.23478567600250244, "timestamp": "2025-09-05 09:16:31.050538", "step": 3589, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:16:31.214481", "step": 3589, "epoch": 3 }, { "type": "loss", "content": 0.25303134322166443, "timestamp": "2025-09-05 09:16:31.216538", "step": 3590, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:16:31.353478", "step": 3590, "epoch": 3 }, { "type": "loss", "content": 0.3114836812019348, "timestamp": "2025-09-05 09:16:31.355987", "step": 3591, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:16:31.491787", "step": 3591, "epoch": 3 }, { "type": "loss", "content": 0.341848224401474, "timestamp": "2025-09-05 09:16:31.507779", "step": 3592, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:16:31.672999", "step": 3592, "epoch": 3 }, { "type": "loss", "content": 0.34669381380081177, "timestamp": "2025-09-05 09:16:31.675020", "step": 3593, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:16:31.839374", "step": 3593, "epoch": 3 }, { "type": "loss", "content": 0.24734577536582947, "timestamp": "2025-09-05 09:16:31.841260", "step": 3594, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:16:31.979773", "step": 3594, "epoch": 3 }, { "type": "loss", "content": 0.2852213978767395, "timestamp": "2025-09-05 09:16:31.982039", "step": 3595, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:16:32.154180", "step": 3595, "epoch": 3 }, { "type": "loss", "content": 0.20489144325256348, "timestamp": "2025-09-05 09:16:32.168329", "step": 3596, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:16:32.328468", "step": 3596, "epoch": 3 }, { "type": "loss", "content": 0.2507217824459076, "timestamp": "2025-09-05 09:16:32.330751", "step": 3597, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 4480027263872.0 }, "timestamp": "2025-09-05 09:16:32.494693", "step": 3597, "epoch": 3 }, { "type": "loss", "content": 0.29786399006843567, "timestamp": "2025-09-05 09:16:32.496588", "step": 3598, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:16:32.669476", "step": 3598, "epoch": 3 }, { "type": "loss", "content": 0.31531691551208496, "timestamp": "2025-09-05 09:16:32.671583", "step": 3599, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:16:32.845699", "step": 3599, "epoch": 3 }, { "type": "loss", "content": 0.28864872455596924, "timestamp": "2025-09-05 09:16:32.862166", "step": 3600, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 09:16:37.515636", "step": 3600, "epoch": 3 }, { "type": "pplx", "content": 56.849603424621414, "timestamp": "2025-09-05 09:16:37.517453", "step": 3600, "epoch": 3 }, { "type": "info", "content": "Checkpoint saved at step 3600", "timestamp": "2025-09-05 09:16:37.973586", "step": 3600, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:16:38.113656", "step": 3600, "epoch": 3 }, { "type": "loss", "content": 0.3407289683818817, "timestamp": "2025-09-05 09:16:38.115499", "step": 3601, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:16:38.313738", "step": 3601, "epoch": 3 }, { "type": "loss", "content": 0.2807539999485016, "timestamp": "2025-09-05 09:16:38.315728", "step": 3602, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:16:38.481679", "step": 3602, "epoch": 3 }, { "type": "loss", "content": 0.2940472662448883, "timestamp": "2025-09-05 09:16:38.483627", "step": 3603, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:16:38.650368", "step": 3603, "epoch": 3 }, { "type": "loss", "content": 0.3419131934642792, "timestamp": "2025-09-05 09:16:38.663614", "step": 3604, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:16:38.821500", "step": 3604, "epoch": 3 }, { "type": "loss", "content": 0.2667962610721588, "timestamp": "2025-09-05 09:16:38.824940", "step": 3605, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:16:38.989583", "step": 3605, "epoch": 3 }, { "type": "loss", "content": 0.3813531696796417, "timestamp": "2025-09-05 09:16:38.991762", "step": 3606, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:16:39.156661", "step": 3606, "epoch": 3 }, { "type": "loss", "content": 0.21620894968509674, "timestamp": "2025-09-05 09:16:39.158848", "step": 3607, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:16:39.322673", "step": 3607, "epoch": 3 }, { "type": "loss", "content": 0.28690725564956665, "timestamp": "2025-09-05 09:16:39.337856", "step": 3608, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:16:39.504144", "step": 3608, "epoch": 3 }, { "type": "loss", "content": 0.2018442302942276, "timestamp": "2025-09-05 09:16:39.506367", "step": 3609, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:16:39.694573", "step": 3609, "epoch": 3 }, { "type": "loss", "content": 0.24246446788311005, "timestamp": "2025-09-05 09:16:39.696675", "step": 3610, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:16:39.903678", "step": 3610, "epoch": 3 }, { "type": "loss", "content": 0.2503988444805145, "timestamp": "2025-09-05 09:16:39.905565", "step": 3611, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:16:40.101897", "step": 3611, "epoch": 3 }, { "type": "loss", "content": 0.2636902928352356, "timestamp": "2025-09-05 09:16:40.114992", "step": 3612, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:16:40.309381", "step": 3612, "epoch": 3 }, { "type": "loss", "content": 0.2856784164905548, "timestamp": "2025-09-05 09:16:40.311717", "step": 3613, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:16:40.507200", "step": 3613, "epoch": 3 }, { "type": "loss", "content": 0.21770760416984558, "timestamp": "2025-09-05 09:16:40.509303", "step": 3614, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:16:40.712550", "step": 3614, "epoch": 3 }, { "type": "loss", "content": 0.3094342350959778, "timestamp": "2025-09-05 09:16:40.714794", "step": 3615, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:16:40.921171", "step": 3615, "epoch": 3 }, { "type": "loss", "content": 0.22260645031929016, "timestamp": "2025-09-05 09:16:40.937490", "step": 3616, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:16:41.132556", "step": 3616, "epoch": 3 }, { "type": "loss", "content": 0.2806592881679535, "timestamp": "2025-09-05 09:16:41.134659", "step": 3617, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:16:41.330202", "step": 3617, "epoch": 3 }, { "type": "loss", "content": 0.2558315098285675, "timestamp": "2025-09-05 09:16:41.332341", "step": 3618, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:16:41.535714", "step": 3618, "epoch": 3 }, { "type": "loss", "content": 0.29388701915740967, "timestamp": "2025-09-05 09:16:41.538011", "step": 3619, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:16:41.735923", "step": 3619, "epoch": 3 }, { "type": "loss", "content": 0.233632892370224, "timestamp": "2025-09-05 09:16:41.750243", "step": 3620, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 09:16:46.373597", "step": 3620, "epoch": 3 }, { "type": "pplx", "content": 56.65468023616325, "timestamp": "2025-09-05 09:16:46.375753", "step": 3620, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:16:46.536738", "step": 3620, "epoch": 3 }, { "type": "loss", "content": 0.3380981683731079, "timestamp": "2025-09-05 09:16:46.539311", "step": 3621, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:16:46.703329", "step": 3621, "epoch": 3 }, { "type": "loss", "content": 0.44205331802368164, "timestamp": "2025-09-05 09:16:46.705667", "step": 3622, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:16:46.911147", "step": 3622, "epoch": 3 }, { "type": "loss", "content": 0.22291947901248932, "timestamp": "2025-09-05 09:16:46.913490", "step": 3623, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:16:47.117238", "step": 3623, "epoch": 3 }, { "type": "loss", "content": 0.2371853142976761, "timestamp": "2025-09-05 09:16:47.134683", "step": 3624, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:16:47.325947", "step": 3624, "epoch": 3 }, { "type": "loss", "content": 0.3914877772331238, "timestamp": "2025-09-05 09:16:47.327818", "step": 3625, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:16:47.522207", "step": 3625, "epoch": 3 }, { "type": "loss", "content": 0.18587626516819, "timestamp": "2025-09-05 09:16:47.524016", "step": 3626, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:16:47.727008", "step": 3626, "epoch": 3 }, { "type": "loss", "content": 0.33849427103996277, "timestamp": "2025-09-05 09:16:47.728996", "step": 3627, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:16:47.932545", "step": 3627, "epoch": 3 }, { "type": "loss", "content": 0.30909058451652527, "timestamp": "2025-09-05 09:16:47.948909", "step": 3628, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:16:48.145519", "step": 3628, "epoch": 3 }, { "type": "loss", "content": 0.3659791350364685, "timestamp": "2025-09-05 09:16:48.147519", "step": 3629, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:16:48.352926", "step": 3629, "epoch": 3 }, { "type": "loss", "content": 0.33907654881477356, "timestamp": "2025-09-05 09:16:48.354868", "step": 3630, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:16:48.549234", "step": 3630, "epoch": 3 }, { "type": "loss", "content": 0.20109602808952332, "timestamp": "2025-09-05 09:16:48.551245", "step": 3631, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:16:48.756040", "step": 3631, "epoch": 3 }, { "type": "loss", "content": 0.20491234958171844, "timestamp": "2025-09-05 09:16:48.769919", "step": 3632, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:16:48.959793", "step": 3632, "epoch": 3 }, { "type": "loss", "content": 0.22076480090618134, "timestamp": "2025-09-05 09:16:48.961632", "step": 3633, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:16:49.156216", "step": 3633, "epoch": 3 }, { "type": "loss", "content": 0.3952508568763733, "timestamp": "2025-09-05 09:16:49.158140", "step": 3634, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 4480027263872.0 }, "timestamp": "2025-09-05 09:16:49.353843", "step": 3634, "epoch": 3 }, { "type": "loss", "content": 0.365893691778183, "timestamp": "2025-09-05 09:16:49.356162", "step": 3635, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:16:49.552509", "step": 3635, "epoch": 3 }, { "type": "loss", "content": 0.4604712426662445, "timestamp": "2025-09-05 09:16:49.566332", "step": 3636, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:16:49.753640", "step": 3636, "epoch": 3 }, { "type": "loss", "content": 0.2598891258239746, "timestamp": "2025-09-05 09:16:49.755677", "step": 3637, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:16:49.952092", "step": 3637, "epoch": 3 }, { "type": "loss", "content": 0.19562163949012756, "timestamp": "2025-09-05 09:16:49.954362", "step": 3638, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 4480027263872.0 }, "timestamp": "2025-09-05 09:16:50.150810", "step": 3638, "epoch": 3 }, { "type": "loss", "content": 0.3006066679954529, "timestamp": "2025-09-05 09:16:50.152516", "step": 3639, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:16:50.357244", "step": 3639, "epoch": 3 }, { "type": "loss", "content": 0.26009029150009155, "timestamp": "2025-09-05 09:16:50.370461", "step": 3640, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 09:16:55.002230", "step": 3640, "epoch": 3 }, { "type": "pplx", "content": 57.14356651280716, "timestamp": "2025-09-05 09:16:55.003937", "step": 3640, "epoch": 3 }, { "type": "info", "content": "Checkpoint saved at step 3640", "timestamp": "2025-09-05 09:16:55.470108", "step": 3640, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:16:55.637860", "step": 3640, "epoch": 3 }, { "type": "loss", "content": 0.33125096559524536, "timestamp": "2025-09-05 09:16:55.640180", "step": 3641, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:16:55.834264", "step": 3641, "epoch": 3 }, { "type": "loss", "content": 0.29791468381881714, "timestamp": "2025-09-05 09:16:55.836217", "step": 3642, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:16:56.037864", "step": 3642, "epoch": 3 }, { "type": "loss", "content": 0.22799059748649597, "timestamp": "2025-09-05 09:16:56.040517", "step": 3643, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:16:56.237007", "step": 3643, "epoch": 3 }, { "type": "loss", "content": 0.19178558886051178, "timestamp": "2025-09-05 09:16:56.251290", "step": 3644, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:16:56.437268", "step": 3644, "epoch": 3 }, { "type": "loss", "content": 0.2333633154630661, "timestamp": "2025-09-05 09:16:56.439316", "step": 3645, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:16:56.641884", "step": 3645, "epoch": 3 }, { "type": "loss", "content": 0.3371458053588867, "timestamp": "2025-09-05 09:16:56.643663", "step": 3646, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:16:56.838438", "step": 3646, "epoch": 3 }, { "type": "loss", "content": 0.2189522385597229, "timestamp": "2025-09-05 09:16:56.840576", "step": 3647, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:16:57.036412", "step": 3647, "epoch": 3 }, { "type": "loss", "content": 0.20619438588619232, "timestamp": "2025-09-05 09:16:57.052386", "step": 3648, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:16:57.247348", "step": 3648, "epoch": 3 }, { "type": "loss", "content": 0.3777850568294525, "timestamp": "2025-09-05 09:16:57.249655", "step": 3649, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:16:57.453023", "step": 3649, "epoch": 3 }, { "type": "loss", "content": 0.19339697062969208, "timestamp": "2025-09-05 09:16:57.455107", "step": 3650, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:16:57.650129", "step": 3650, "epoch": 3 }, { "type": "loss", "content": 0.24509845674037933, "timestamp": "2025-09-05 09:16:57.653034", "step": 3651, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:16:57.851212", "step": 3651, "epoch": 3 }, { "type": "loss", "content": 0.20124991238117218, "timestamp": "2025-09-05 09:16:57.865147", "step": 3652, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:16:58.053115", "step": 3652, "epoch": 3 }, { "type": "loss", "content": 0.2905905246734619, "timestamp": "2025-09-05 09:16:58.055367", "step": 3653, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:16:58.259967", "step": 3653, "epoch": 3 }, { "type": "loss", "content": 0.3718425929546356, "timestamp": "2025-09-05 09:16:58.262379", "step": 3654, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:16:58.469732", "step": 3654, "epoch": 3 }, { "type": "loss", "content": 0.278747022151947, "timestamp": "2025-09-05 09:16:58.472249", "step": 3655, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:16:58.637485", "step": 3655, "epoch": 3 }, { "type": "loss", "content": 0.15937921404838562, "timestamp": "2025-09-05 09:16:58.654440", "step": 3656, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:16:58.853334", "step": 3656, "epoch": 3 }, { "type": "loss", "content": 0.25874295830726624, "timestamp": "2025-09-05 09:16:58.855297", "step": 3657, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:16:59.060556", "step": 3657, "epoch": 3 }, { "type": "loss", "content": 0.29497626423835754, "timestamp": "2025-09-05 09:16:59.062998", "step": 3658, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:16:59.260298", "step": 3658, "epoch": 3 }, { "type": "loss", "content": 0.31460994482040405, "timestamp": "2025-09-05 09:16:59.262889", "step": 3659, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:16:59.468470", "step": 3659, "epoch": 3 }, { "type": "loss", "content": 0.23476682603359222, "timestamp": "2025-09-05 09:16:59.482345", "step": 3660, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 09:17:04.179168", "step": 3660, "epoch": 3 }, { "type": "pplx", "content": 58.84672962747659, "timestamp": "2025-09-05 09:17:04.183501", "step": 3660, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 4480027263872.0 }, "timestamp": "2025-09-05 09:17:04.345280", "step": 3660, "epoch": 3 }, { "type": "loss", "content": 0.2967011034488678, "timestamp": "2025-09-05 09:17:04.347457", "step": 3661, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:17:04.486037", "step": 3661, "epoch": 3 }, { "type": "loss", "content": 0.20157547295093536, "timestamp": "2025-09-05 09:17:04.491324", "step": 3662, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:17:04.669365", "step": 3662, "epoch": 3 }, { "type": "loss", "content": 0.29696306586265564, "timestamp": "2025-09-05 09:17:04.671800", "step": 3663, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:17:04.836191", "step": 3663, "epoch": 3 }, { "type": "loss", "content": 0.31658288836479187, "timestamp": "2025-09-05 09:17:04.845138", "step": 3664, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:17:04.981361", "step": 3664, "epoch": 3 }, { "type": "loss", "content": 0.31298524141311646, "timestamp": "2025-09-05 09:17:04.986225", "step": 3665, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:17:05.204135", "step": 3665, "epoch": 3 }, { "type": "loss", "content": 0.24473224580287933, "timestamp": "2025-09-05 09:17:05.207421", "step": 3666, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:17:05.386416", "step": 3666, "epoch": 3 }, { "type": "loss", "content": 0.30507656931877136, "timestamp": "2025-09-05 09:17:05.388406", "step": 3667, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:17:05.562709", "step": 3667, "epoch": 3 }, { "type": "loss", "content": 0.3829200565814972, "timestamp": "2025-09-05 09:17:05.572358", "step": 3668, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 4480027263872.0 }, "timestamp": "2025-09-05 09:17:05.707364", "step": 3668, "epoch": 3 }, { "type": "loss", "content": 0.21303744614124298, "timestamp": "2025-09-05 09:17:05.709248", "step": 3669, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:17:05.873112", "step": 3669, "epoch": 3 }, { "type": "loss", "content": 0.19828033447265625, "timestamp": "2025-09-05 09:17:05.875438", "step": 3670, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:17:06.044585", "step": 3670, "epoch": 3 }, { "type": "loss", "content": 0.19999150931835175, "timestamp": "2025-09-05 09:17:06.046676", "step": 3671, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-05 09:17:06.210600", "step": 3671, "epoch": 3 }, { "type": "loss", "content": 0.20807354152202606, "timestamp": "2025-09-05 09:17:06.224093", "step": 3672, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:17:06.381194", "step": 3672, "epoch": 3 }, { "type": "loss", "content": 0.337863951921463, "timestamp": "2025-09-05 09:17:06.383013", "step": 3673, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:17:06.548655", "step": 3673, "epoch": 3 }, { "type": "loss", "content": 0.43848153948783875, "timestamp": "2025-09-05 09:17:06.550812", "step": 3674, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:17:06.726192", "step": 3674, "epoch": 3 }, { "type": "loss", "content": 0.3983319401741028, "timestamp": "2025-09-05 09:17:06.728124", "step": 3675, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:17:06.902251", "step": 3675, "epoch": 3 }, { "type": "loss", "content": 0.18885241448879242, "timestamp": "2025-09-05 09:17:06.917409", "step": 3676, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:17:07.095542", "step": 3676, "epoch": 3 }, { "type": "loss", "content": 0.22875548899173737, "timestamp": "2025-09-05 09:17:07.097896", "step": 3677, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:17:07.261387", "step": 3677, "epoch": 3 }, { "type": "loss", "content": 0.19291433691978455, "timestamp": "2025-09-05 09:17:07.263485", "step": 3678, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:17:07.436357", "step": 3678, "epoch": 3 }, { "type": "loss", "content": 0.2069120854139328, "timestamp": "2025-09-05 09:17:07.438453", "step": 3679, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:17:07.613336", "step": 3679, "epoch": 3 }, { "type": "loss", "content": 0.27292466163635254, "timestamp": "2025-09-05 09:17:07.627569", "step": 3680, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 09:17:12.442469", "step": 3680, "epoch": 3 }, { "type": "pplx", "content": 58.00292293677735, "timestamp": "2025-09-05 09:17:12.444593", "step": 3680, "epoch": 3 }, { "type": "info", "content": "Checkpoint saved at step 3680", "timestamp": "2025-09-05 09:17:12.923472", "step": 3680, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:17:13.096349", "step": 3680, "epoch": 3 }, { "type": "loss", "content": 0.28774937987327576, "timestamp": "2025-09-05 09:17:13.099635", "step": 3681, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:17:13.270133", "step": 3681, "epoch": 3 }, { "type": "loss", "content": 0.29541826248168945, "timestamp": "2025-09-05 09:17:13.272636", "step": 3682, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:17:13.446220", "step": 3682, "epoch": 3 }, { "type": "loss", "content": 0.20348146557807922, "timestamp": "2025-09-05 09:17:13.448403", "step": 3683, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:17:13.621708", "step": 3683, "epoch": 3 }, { "type": "loss", "content": 0.27651840448379517, "timestamp": "2025-09-05 09:17:13.637686", "step": 3684, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:17:13.803773", "step": 3684, "epoch": 3 }, { "type": "loss", "content": 0.16801585257053375, "timestamp": "2025-09-05 09:17:13.811331", "step": 3685, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:17:13.977709", "step": 3685, "epoch": 3 }, { "type": "loss", "content": 0.27807697653770447, "timestamp": "2025-09-05 09:17:13.990085", "step": 3686, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:17:14.174891", "step": 3686, "epoch": 3 }, { "type": "loss", "content": 0.28929510712623596, "timestamp": "2025-09-05 09:17:14.176862", "step": 3687, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:17:14.342954", "step": 3687, "epoch": 3 }, { "type": "loss", "content": 0.33495303988456726, "timestamp": "2025-09-05 09:17:14.359164", "step": 3688, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:17:14.525050", "step": 3688, "epoch": 3 }, { "type": "loss", "content": 0.40975499153137207, "timestamp": "2025-09-05 09:17:14.530323", "step": 3689, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-05 09:17:14.707766", "step": 3689, "epoch": 3 }, { "type": "loss", "content": 0.34368452429771423, "timestamp": "2025-09-05 09:17:14.711194", "step": 3690, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:17:14.885111", "step": 3690, "epoch": 3 }, { "type": "loss", "content": 0.21246421337127686, "timestamp": "2025-09-05 09:17:14.888437", "step": 3691, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:17:15.063534", "step": 3691, "epoch": 3 }, { "type": "loss", "content": 0.17844977974891663, "timestamp": "2025-09-05 09:17:15.079004", "step": 3692, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:17:15.247480", "step": 3692, "epoch": 3 }, { "type": "loss", "content": 0.23904670774936676, "timestamp": "2025-09-05 09:17:15.250098", "step": 3693, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 4480027263872.0 }, "timestamp": "2025-09-05 09:17:15.419302", "step": 3693, "epoch": 3 }, { "type": "loss", "content": 0.2476682811975479, "timestamp": "2025-09-05 09:17:15.421347", "step": 3694, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:17:15.585971", "step": 3694, "epoch": 3 }, { "type": "loss", "content": 0.14145521819591522, "timestamp": "2025-09-05 09:17:15.587978", "step": 3695, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:17:15.755238", "step": 3695, "epoch": 3 }, { "type": "loss", "content": 0.2892580032348633, "timestamp": "2025-09-05 09:17:15.769585", "step": 3696, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:17:15.933810", "step": 3696, "epoch": 3 }, { "type": "loss", "content": 0.20448994636535645, "timestamp": "2025-09-05 09:17:15.935738", "step": 3697, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:17:16.110181", "step": 3697, "epoch": 3 }, { "type": "loss", "content": 0.30457159876823425, "timestamp": "2025-09-05 09:17:16.112415", "step": 3698, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:17:16.278273", "step": 3698, "epoch": 3 }, { "type": "loss", "content": 0.19800890982151031, "timestamp": "2025-09-05 09:17:16.281130", "step": 3699, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:17:16.447546", "step": 3699, "epoch": 3 }, { "type": "loss", "content": 0.10996751487255096, "timestamp": "2025-09-05 09:17:16.464001", "step": 3700, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 09:17:21.189675", "step": 3700, "epoch": 3 }, { "type": "pplx", "content": 56.65837507121739, "timestamp": "2025-09-05 09:17:21.192210", "step": 3700, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:17:21.325124", "step": 3700, "epoch": 3 }, { "type": "loss", "content": 0.35211700201034546, "timestamp": "2025-09-05 09:17:21.328555", "step": 3701, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:17:21.478549", "step": 3701, "epoch": 3 }, { "type": "loss", "content": 0.2543327510356903, "timestamp": "2025-09-05 09:17:21.483201", "step": 3702, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:17:21.661615", "step": 3702, "epoch": 3 }, { "type": "loss", "content": 0.10399866104125977, "timestamp": "2025-09-05 09:17:21.663790", "step": 3703, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:17:21.839507", "step": 3703, "epoch": 3 }, { "type": "loss", "content": 0.17878511548042297, "timestamp": "2025-09-05 09:17:21.853699", "step": 3704, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:17:22.023690", "step": 3704, "epoch": 3 }, { "type": "loss", "content": 0.25016236305236816, "timestamp": "2025-09-05 09:17:22.026198", "step": 3705, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:17:22.190767", "step": 3705, "epoch": 3 }, { "type": "loss", "content": 0.356041818857193, "timestamp": "2025-09-05 09:17:22.192750", "step": 3706, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:17:22.353308", "step": 3706, "epoch": 3 }, { "type": "loss", "content": 0.31788352131843567, "timestamp": "2025-09-05 09:17:22.355417", "step": 3707, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 272 ], "flops": 5440033091648.0 }, "timestamp": "2025-09-05 09:17:22.513433", "step": 3707, "epoch": 3 }, { "type": "loss", "content": 0.4455195367336273, "timestamp": "2025-09-05 09:17:22.529158", "step": 3708, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:17:22.694433", "step": 3708, "epoch": 3 }, { "type": "loss", "content": 0.1478017121553421, "timestamp": "2025-09-05 09:17:22.696732", "step": 3709, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:17:22.865305", "step": 3709, "epoch": 3 }, { "type": "loss", "content": 0.30112531781196594, "timestamp": "2025-09-05 09:17:22.867849", "step": 3710, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:17:23.040859", "step": 3710, "epoch": 3 }, { "type": "loss", "content": 0.24474984407424927, "timestamp": "2025-09-05 09:17:23.043598", "step": 3711, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:17:23.202386", "step": 3711, "epoch": 3 }, { "type": "loss", "content": 0.2871285080909729, "timestamp": "2025-09-05 09:17:23.217262", "step": 3712, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:17:23.370485", "step": 3712, "epoch": 3 }, { "type": "loss", "content": 0.2523558437824249, "timestamp": "2025-09-05 09:17:23.373047", "step": 3713, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:17:23.533342", "step": 3713, "epoch": 3 }, { "type": "loss", "content": 0.1851450353860855, "timestamp": "2025-09-05 09:17:23.535368", "step": 3714, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:17:23.712360", "step": 3714, "epoch": 3 }, { "type": "loss", "content": 0.41676124930381775, "timestamp": "2025-09-05 09:17:23.714662", "step": 3715, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:17:23.886528", "step": 3715, "epoch": 3 }, { "type": "loss", "content": 0.28601813316345215, "timestamp": "2025-09-05 09:17:23.903673", "step": 3716, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-05 09:17:24.067942", "step": 3716, "epoch": 3 }, { "type": "loss", "content": 0.2414342612028122, "timestamp": "2025-09-05 09:17:24.070264", "step": 3717, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:17:24.236463", "step": 3717, "epoch": 3 }, { "type": "loss", "content": 0.28098100423812866, "timestamp": "2025-09-05 09:17:24.239880", "step": 3718, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:17:24.398819", "step": 3718, "epoch": 3 }, { "type": "loss", "content": 0.2182604819536209, "timestamp": "2025-09-05 09:17:24.402058", "step": 3719, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:17:24.569550", "step": 3719, "epoch": 3 }, { "type": "loss", "content": 0.21126191318035126, "timestamp": "2025-09-05 09:17:24.587286", "step": 3720, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 09:17:29.301300", "step": 3720, "epoch": 3 }, { "type": "pplx", "content": 57.851098754460935, "timestamp": "2025-09-05 09:17:29.303386", "step": 3720, "epoch": 3 }, { "type": "info", "content": "Checkpoint saved at step 3720", "timestamp": "2025-09-05 09:17:29.780448", "step": 3720, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-05 09:17:29.918245", "step": 3720, "epoch": 3 }, { "type": "loss", "content": 0.17629235982894897, "timestamp": "2025-09-05 09:17:29.920249", "step": 3721, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:17:30.077145", "step": 3721, "epoch": 3 }, { "type": "loss", "content": 0.19920383393764496, "timestamp": "2025-09-05 09:17:30.080108", "step": 3722, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:17:30.257935", "step": 3722, "epoch": 3 }, { "type": "loss", "content": 0.1470199078321457, "timestamp": "2025-09-05 09:17:30.260188", "step": 3723, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:17:30.421805", "step": 3723, "epoch": 3 }, { "type": "loss", "content": 0.3522961735725403, "timestamp": "2025-09-05 09:17:30.434868", "step": 3724, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:17:30.588266", "step": 3724, "epoch": 3 }, { "type": "loss", "content": 0.22870679199695587, "timestamp": "2025-09-05 09:17:30.590259", "step": 3725, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:17:30.751751", "step": 3725, "epoch": 3 }, { "type": "loss", "content": 0.35698115825653076, "timestamp": "2025-09-05 09:17:30.754837", "step": 3726, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-05 09:17:30.914440", "step": 3726, "epoch": 3 }, { "type": "loss", "content": 0.2308841049671173, "timestamp": "2025-09-05 09:17:30.916372", "step": 3727, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:17:31.089362", "step": 3727, "epoch": 3 }, { "type": "loss", "content": 0.30304333567619324, "timestamp": "2025-09-05 09:17:31.103816", "step": 3728, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:17:31.258454", "step": 3728, "epoch": 3 }, { "type": "loss", "content": 0.4719480872154236, "timestamp": "2025-09-05 09:17:31.261128", "step": 3729, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:17:31.419266", "step": 3729, "epoch": 3 }, { "type": "loss", "content": 0.23239001631736755, "timestamp": "2025-09-05 09:17:31.421208", "step": 3730, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:17:31.594334", "step": 3730, "epoch": 3 }, { "type": "loss", "content": 0.25132450461387634, "timestamp": "2025-09-05 09:17:31.596356", "step": 3731, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:17:31.769554", "step": 3731, "epoch": 3 }, { "type": "loss", "content": 0.2377987802028656, "timestamp": "2025-09-05 09:17:31.782682", "step": 3732, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:17:31.935284", "step": 3732, "epoch": 3 }, { "type": "loss", "content": 0.21928219497203827, "timestamp": "2025-09-05 09:17:31.937514", "step": 3733, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:17:32.094333", "step": 3733, "epoch": 3 }, { "type": "loss", "content": 0.32776015996932983, "timestamp": "2025-09-05 09:17:32.096980", "step": 3734, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:17:32.257921", "step": 3734, "epoch": 3 }, { "type": "loss", "content": 0.3067801594734192, "timestamp": "2025-09-05 09:17:32.260923", "step": 3735, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:17:32.433004", "step": 3735, "epoch": 3 }, { "type": "loss", "content": 0.2571111023426056, "timestamp": "2025-09-05 09:17:32.450589", "step": 3736, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:17:32.617665", "step": 3736, "epoch": 3 }, { "type": "loss", "content": 0.31286391615867615, "timestamp": "2025-09-05 09:17:32.621410", "step": 3737, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:17:32.804127", "step": 3737, "epoch": 3 }, { "type": "loss", "content": 0.1266995221376419, "timestamp": "2025-09-05 09:17:32.806894", "step": 3738, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:17:32.989557", "step": 3738, "epoch": 3 }, { "type": "loss", "content": 0.18480810523033142, "timestamp": "2025-09-05 09:17:32.993892", "step": 3739, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:17:33.157154", "step": 3739, "epoch": 3 }, { "type": "loss", "content": 0.21677125990390778, "timestamp": "2025-09-05 09:17:33.174230", "step": 3740, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 09:17:37.915992", "step": 3740, "epoch": 3 }, { "type": "pplx", "content": 58.138925363076176, "timestamp": "2025-09-05 09:17:37.919958", "step": 3740, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:17:38.054986", "step": 3740, "epoch": 3 }, { "type": "loss", "content": 0.17171865701675415, "timestamp": "2025-09-05 09:17:38.058251", "step": 3741, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:17:38.194742", "step": 3741, "epoch": 3 }, { "type": "loss", "content": 0.11303048580884933, "timestamp": "2025-09-05 09:17:38.196873", "step": 3742, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:17:38.367087", "step": 3742, "epoch": 3 }, { "type": "loss", "content": 0.29843342304229736, "timestamp": "2025-09-05 09:17:38.369146", "step": 3743, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:17:38.528710", "step": 3743, "epoch": 3 }, { "type": "loss", "content": 0.28549709916114807, "timestamp": "2025-09-05 09:17:38.545287", "step": 3744, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:17:38.704412", "step": 3744, "epoch": 3 }, { "type": "loss", "content": 0.3105151653289795, "timestamp": "2025-09-05 09:17:38.707561", "step": 3745, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:17:38.870199", "step": 3745, "epoch": 3 }, { "type": "loss", "content": 0.2878493368625641, "timestamp": "2025-09-05 09:17:38.874012", "step": 3746, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:17:39.047543", "step": 3746, "epoch": 3 }, { "type": "loss", "content": 0.31547924876213074, "timestamp": "2025-09-05 09:17:39.049871", "step": 3747, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 4480027263872.0 }, "timestamp": "2025-09-05 09:17:39.220202", "step": 3747, "epoch": 3 }, { "type": "loss", "content": 0.4833051860332489, "timestamp": "2025-09-05 09:17:39.234546", "step": 3748, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:17:39.394896", "step": 3748, "epoch": 3 }, { "type": "loss", "content": 0.21690459549427032, "timestamp": "2025-09-05 09:17:39.396991", "step": 3749, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:17:39.567268", "step": 3749, "epoch": 3 }, { "type": "loss", "content": 0.24650557339191437, "timestamp": "2025-09-05 09:17:39.572449", "step": 3750, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:17:39.741704", "step": 3750, "epoch": 3 }, { "type": "loss", "content": 0.3864779770374298, "timestamp": "2025-09-05 09:17:39.744322", "step": 3751, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:17:39.929467", "step": 3751, "epoch": 3 }, { "type": "loss", "content": 0.271431028842926, "timestamp": "2025-09-05 09:17:39.946728", "step": 3752, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:17:40.129181", "step": 3752, "epoch": 3 }, { "type": "loss", "content": 0.2465740144252777, "timestamp": "2025-09-05 09:17:40.131170", "step": 3753, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:17:40.290242", "step": 3753, "epoch": 3 }, { "type": "loss", "content": 0.18188771605491638, "timestamp": "2025-09-05 09:17:40.293833", "step": 3754, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:17:40.453968", "step": 3754, "epoch": 3 }, { "type": "loss", "content": 0.3101077079772949, "timestamp": "2025-09-05 09:17:40.465944", "step": 3755, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:17:40.631092", "step": 3755, "epoch": 3 }, { "type": "loss", "content": 0.3483135402202606, "timestamp": "2025-09-05 09:17:40.648326", "step": 3756, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:17:40.816767", "step": 3756, "epoch": 3 }, { "type": "loss", "content": 0.3474515676498413, "timestamp": "2025-09-05 09:17:40.820076", "step": 3757, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:17:40.995965", "step": 3757, "epoch": 3 }, { "type": "loss", "content": 0.3206429183483124, "timestamp": "2025-09-05 09:17:40.998901", "step": 3758, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:17:41.157141", "step": 3758, "epoch": 3 }, { "type": "loss", "content": 0.3244212567806244, "timestamp": "2025-09-05 09:17:41.159484", "step": 3759, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 4480027263872.0 }, "timestamp": "2025-09-05 09:17:41.332000", "step": 3759, "epoch": 3 }, { "type": "loss", "content": 0.1970166712999344, "timestamp": "2025-09-05 09:17:41.348675", "step": 3760, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 09:17:46.106629", "step": 3760, "epoch": 3 }, { "type": "pplx", "content": 56.030742259845105, "timestamp": "2025-09-05 09:17:46.109476", "step": 3760, "epoch": 3 }, { "type": "info", "content": "Checkpoint saved at step 3760", "timestamp": "2025-09-05 09:17:46.628384", "step": 3760, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:17:46.817598", "step": 3760, "epoch": 3 }, { "type": "loss", "content": 0.24627402424812317, "timestamp": "2025-09-05 09:17:46.819504", "step": 3761, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:17:47.015458", "step": 3761, "epoch": 3 }, { "type": "loss", "content": 0.4071979224681854, "timestamp": "2025-09-05 09:17:47.017803", "step": 3762, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:17:47.225117", "step": 3762, "epoch": 3 }, { "type": "loss", "content": 0.2320803850889206, "timestamp": "2025-09-05 09:17:47.228767", "step": 3763, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:17:47.431687", "step": 3763, "epoch": 3 }, { "type": "loss", "content": 0.2511028051376343, "timestamp": "2025-09-05 09:17:47.447973", "step": 3764, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:17:47.644725", "step": 3764, "epoch": 3 }, { "type": "loss", "content": 0.1637968271970749, "timestamp": "2025-09-05 09:17:47.646815", "step": 3765, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:17:47.855989", "step": 3765, "epoch": 3 }, { "type": "loss", "content": 0.4574233293533325, "timestamp": "2025-09-05 09:17:47.858374", "step": 3766, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:17:48.053732", "step": 3766, "epoch": 3 }, { "type": "loss", "content": 0.2990650236606598, "timestamp": "2025-09-05 09:17:48.055922", "step": 3767, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:17:48.251155", "step": 3767, "epoch": 3 }, { "type": "loss", "content": 0.34469056129455566, "timestamp": "2025-09-05 09:17:48.264540", "step": 3768, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:17:48.451902", "step": 3768, "epoch": 3 }, { "type": "loss", "content": 0.1231929287314415, "timestamp": "2025-09-05 09:17:48.454219", "step": 3769, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:17:48.650029", "step": 3769, "epoch": 3 }, { "type": "loss", "content": 0.22926165163516998, "timestamp": "2025-09-05 09:17:48.652239", "step": 3770, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:17:48.855961", "step": 3770, "epoch": 3 }, { "type": "loss", "content": 0.1999165564775467, "timestamp": "2025-09-05 09:17:48.858063", "step": 3771, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:17:49.053725", "step": 3771, "epoch": 3 }, { "type": "loss", "content": 0.29211559891700745, "timestamp": "2025-09-05 09:17:49.068234", "step": 3772, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:17:49.260693", "step": 3772, "epoch": 3 }, { "type": "loss", "content": 0.2825232446193695, "timestamp": "2025-09-05 09:17:49.265059", "step": 3773, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:17:49.468313", "step": 3773, "epoch": 3 }, { "type": "loss", "content": 0.2714124917984009, "timestamp": "2025-09-05 09:17:49.473227", "step": 3774, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:17:49.685717", "step": 3774, "epoch": 3 }, { "type": "loss", "content": 0.3337889611721039, "timestamp": "2025-09-05 09:17:49.692216", "step": 3775, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:17:49.897333", "step": 3775, "epoch": 3 }, { "type": "loss", "content": 0.28709539771080017, "timestamp": "2025-09-05 09:17:49.911174", "step": 3776, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:17:50.100013", "step": 3776, "epoch": 3 }, { "type": "loss", "content": 0.3871142268180847, "timestamp": "2025-09-05 09:17:50.101949", "step": 3777, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:17:50.294723", "step": 3777, "epoch": 3 }, { "type": "loss", "content": 0.41545554995536804, "timestamp": "2025-09-05 09:17:50.296735", "step": 3778, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:17:50.491618", "step": 3778, "epoch": 3 }, { "type": "loss", "content": 0.33116820454597473, "timestamp": "2025-09-05 09:17:50.494035", "step": 3779, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:17:50.689535", "step": 3779, "epoch": 3 }, { "type": "loss", "content": 0.3106461763381958, "timestamp": "2025-09-05 09:17:50.705678", "step": 3780, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 09:17:55.438519", "step": 3780, "epoch": 3 }, { "type": "pplx", "content": 54.86477109539171, "timestamp": "2025-09-05 09:17:55.440640", "step": 3780, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:17:55.602465", "step": 3780, "epoch": 3 }, { "type": "loss", "content": 0.2829877734184265, "timestamp": "2025-09-05 09:17:55.604819", "step": 3781, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:17:55.772017", "step": 3781, "epoch": 3 }, { "type": "loss", "content": 0.23081792891025543, "timestamp": "2025-09-05 09:17:55.774253", "step": 3782, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:17:55.978453", "step": 3782, "epoch": 3 }, { "type": "loss", "content": 0.2172662913799286, "timestamp": "2025-09-05 09:17:55.980547", "step": 3783, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-05 09:17:56.174995", "step": 3783, "epoch": 3 }, { "type": "loss", "content": 0.23292969167232513, "timestamp": "2025-09-05 09:17:56.191374", "step": 3784, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:17:56.399032", "step": 3784, "epoch": 3 }, { "type": "loss", "content": 0.28087952733039856, "timestamp": "2025-09-05 09:17:56.404084", "step": 3785, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:17:56.613247", "step": 3785, "epoch": 3 }, { "type": "loss", "content": 0.3004533052444458, "timestamp": "2025-09-05 09:17:56.615302", "step": 3786, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:17:56.811021", "step": 3786, "epoch": 3 }, { "type": "loss", "content": 0.22735357284545898, "timestamp": "2025-09-05 09:17:56.813173", "step": 3787, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:17:57.008692", "step": 3787, "epoch": 3 }, { "type": "loss", "content": 0.24109166860580444, "timestamp": "2025-09-05 09:17:57.022332", "step": 3788, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:17:57.210727", "step": 3788, "epoch": 3 }, { "type": "loss", "content": 0.25741469860076904, "timestamp": "2025-09-05 09:17:57.212675", "step": 3789, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:17:57.407077", "step": 3789, "epoch": 3 }, { "type": "loss", "content": 0.18140548467636108, "timestamp": "2025-09-05 09:17:57.409333", "step": 3790, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:17:57.612999", "step": 3790, "epoch": 3 }, { "type": "loss", "content": 0.2594696879386902, "timestamp": "2025-09-05 09:17:57.615012", "step": 3791, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:17:57.810111", "step": 3791, "epoch": 3 }, { "type": "loss", "content": 0.2717820703983307, "timestamp": "2025-09-05 09:17:57.827484", "step": 3792, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:17:58.038952", "step": 3792, "epoch": 3 }, { "type": "loss", "content": 0.2998480200767517, "timestamp": "2025-09-05 09:17:58.041150", "step": 3793, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:17:58.237041", "step": 3793, "epoch": 3 }, { "type": "loss", "content": 0.5077316164970398, "timestamp": "2025-09-05 09:17:58.241772", "step": 3794, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-05 09:17:58.436343", "step": 3794, "epoch": 3 }, { "type": "loss", "content": 0.22378866374492645, "timestamp": "2025-09-05 09:17:58.438145", "step": 3795, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:17:58.632479", "step": 3795, "epoch": 3 }, { "type": "loss", "content": 0.2865094840526581, "timestamp": "2025-09-05 09:17:58.646480", "step": 3796, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:17:58.834363", "step": 3796, "epoch": 3 }, { "type": "loss", "content": 0.2359636276960373, "timestamp": "2025-09-05 09:17:58.836348", "step": 3797, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:17:59.039984", "step": 3797, "epoch": 3 }, { "type": "loss", "content": 0.2948862612247467, "timestamp": "2025-09-05 09:17:59.041770", "step": 3798, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:17:59.237989", "step": 3798, "epoch": 3 }, { "type": "loss", "content": 0.2476307600736618, "timestamp": "2025-09-05 09:17:59.243974", "step": 3799, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:17:59.445175", "step": 3799, "epoch": 3 }, { "type": "loss", "content": 0.2329883575439453, "timestamp": "2025-09-05 09:17:59.460504", "step": 3800, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 09:18:04.091371", "step": 3800, "epoch": 3 }, { "type": "pplx", "content": 55.55614732154259, "timestamp": "2025-09-05 09:18:04.093508", "step": 3800, "epoch": 3 }, { "type": "info", "content": "Checkpoint saved at step 3800", "timestamp": "2025-09-05 09:18:04.825315", "step": 3800, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:18:04.992856", "step": 3800, "epoch": 3 }, { "type": "loss", "content": 0.30155929923057556, "timestamp": "2025-09-05 09:18:04.994966", "step": 3801, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:18:05.158446", "step": 3801, "epoch": 3 }, { "type": "loss", "content": 0.41426223516464233, "timestamp": "2025-09-05 09:18:05.164363", "step": 3802, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:18:05.370875", "step": 3802, "epoch": 3 }, { "type": "loss", "content": 0.36889323592185974, "timestamp": "2025-09-05 09:18:05.373209", "step": 3803, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:18:05.569555", "step": 3803, "epoch": 3 }, { "type": "loss", "content": 0.2549019753932953, "timestamp": "2025-09-05 09:18:05.585448", "step": 3804, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:18:05.780737", "step": 3804, "epoch": 3 }, { "type": "loss", "content": 0.46478819847106934, "timestamp": "2025-09-05 09:18:05.783228", "step": 3805, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:18:05.977843", "step": 3805, "epoch": 3 }, { "type": "loss", "content": 0.2971610724925995, "timestamp": "2025-09-05 09:18:05.979793", "step": 3806, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:18:06.175088", "step": 3806, "epoch": 3 }, { "type": "loss", "content": 0.18687593936920166, "timestamp": "2025-09-05 09:18:06.177352", "step": 3807, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:18:06.380428", "step": 3807, "epoch": 3 }, { "type": "loss", "content": 0.3274900019168854, "timestamp": "2025-09-05 09:18:06.394606", "step": 3808, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:18:06.582600", "step": 3808, "epoch": 3 }, { "type": "loss", "content": 0.21774989366531372, "timestamp": "2025-09-05 09:18:06.584933", "step": 3809, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:18:06.780351", "step": 3809, "epoch": 3 }, { "type": "loss", "content": 0.25480151176452637, "timestamp": "2025-09-05 09:18:06.783183", "step": 3810, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:18:06.988610", "step": 3810, "epoch": 3 }, { "type": "loss", "content": 0.28949615359306335, "timestamp": "2025-09-05 09:18:06.991180", "step": 3811, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 4480027263872.0 }, "timestamp": "2025-09-05 09:18:07.197046", "step": 3811, "epoch": 3 }, { "type": "loss", "content": 0.3601587116718292, "timestamp": "2025-09-05 09:18:07.213966", "step": 3812, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:18:07.409722", "step": 3812, "epoch": 3 }, { "type": "loss", "content": 0.2533739507198334, "timestamp": "2025-09-05 09:18:07.412817", "step": 3813, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:18:07.617938", "step": 3813, "epoch": 3 }, { "type": "loss", "content": 0.3216201364994049, "timestamp": "2025-09-05 09:18:07.620227", "step": 3814, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:18:07.823930", "step": 3814, "epoch": 3 }, { "type": "loss", "content": 0.26064953207969666, "timestamp": "2025-09-05 09:18:07.826145", "step": 3815, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:18:08.029069", "step": 3815, "epoch": 3 }, { "type": "loss", "content": 0.4234212338924408, "timestamp": "2025-09-05 09:18:08.045468", "step": 3816, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:18:08.239569", "step": 3816, "epoch": 3 }, { "type": "loss", "content": 0.22362388670444489, "timestamp": "2025-09-05 09:18:08.242035", "step": 3817, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:18:08.444873", "step": 3817, "epoch": 3 }, { "type": "loss", "content": 0.3728334307670593, "timestamp": "2025-09-05 09:18:08.447328", "step": 3818, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:18:08.649347", "step": 3818, "epoch": 3 }, { "type": "loss", "content": 0.46954891085624695, "timestamp": "2025-09-05 09:18:08.651921", "step": 3819, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:18:08.843539", "step": 3819, "epoch": 3 }, { "type": "loss", "content": 0.2965394854545593, "timestamp": "2025-09-05 09:18:08.860185", "step": 3820, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 09:18:13.496780", "step": 3820, "epoch": 3 }, { "type": "pplx", "content": 55.5854559436734, "timestamp": "2025-09-05 09:18:13.499381", "step": 3820, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:18:13.660779", "step": 3820, "epoch": 3 }, { "type": "loss", "content": 0.22472511231899261, "timestamp": "2025-09-05 09:18:13.664259", "step": 3821, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:18:13.866930", "step": 3821, "epoch": 3 }, { "type": "loss", "content": 0.25364920496940613, "timestamp": "2025-09-05 09:18:13.868953", "step": 3822, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:18:14.062950", "step": 3822, "epoch": 3 }, { "type": "loss", "content": 0.21925756335258484, "timestamp": "2025-09-05 09:18:14.065090", "step": 3823, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:18:14.259796", "step": 3823, "epoch": 3 }, { "type": "loss", "content": 0.2519068419933319, "timestamp": "2025-09-05 09:18:14.270679", "step": 3824, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:18:14.439212", "step": 3824, "epoch": 3 }, { "type": "loss", "content": 0.3214259743690491, "timestamp": "2025-09-05 09:18:14.441266", "step": 3825, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:18:14.644776", "step": 3825, "epoch": 3 }, { "type": "loss", "content": 0.30687010288238525, "timestamp": "2025-09-05 09:18:14.648046", "step": 3826, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:18:14.844746", "step": 3826, "epoch": 3 }, { "type": "loss", "content": 0.2706732451915741, "timestamp": "2025-09-05 09:18:14.846875", "step": 3827, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-05 09:18:15.039578", "step": 3827, "epoch": 3 }, { "type": "loss", "content": 0.21124766767024994, "timestamp": "2025-09-05 09:18:15.053944", "step": 3828, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:18:15.240514", "step": 3828, "epoch": 3 }, { "type": "loss", "content": 0.2378242313861847, "timestamp": "2025-09-05 09:18:15.242970", "step": 3829, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:18:15.436793", "step": 3829, "epoch": 3 }, { "type": "loss", "content": 0.37138789892196655, "timestamp": "2025-09-05 09:18:15.439225", "step": 3830, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:18:15.636069", "step": 3830, "epoch": 3 }, { "type": "loss", "content": 0.48219579458236694, "timestamp": "2025-09-05 09:18:15.639027", "step": 3831, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:18:15.844375", "step": 3831, "epoch": 3 }, { "type": "loss", "content": 0.31807440519332886, "timestamp": "2025-09-05 09:18:15.853360", "step": 3832, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:18:16.014950", "step": 3832, "epoch": 3 }, { "type": "loss", "content": 0.19948077201843262, "timestamp": "2025-09-05 09:18:16.016914", "step": 3833, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:18:16.222462", "step": 3833, "epoch": 3 }, { "type": "loss", "content": 0.3451708257198334, "timestamp": "2025-09-05 09:18:16.227796", "step": 3834, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:18:16.432408", "step": 3834, "epoch": 3 }, { "type": "loss", "content": 0.1525815725326538, "timestamp": "2025-09-05 09:18:16.434547", "step": 3835, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:18:16.629223", "step": 3835, "epoch": 3 }, { "type": "loss", "content": 0.25581640005111694, "timestamp": "2025-09-05 09:18:16.644719", "step": 3836, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:18:16.841468", "step": 3836, "epoch": 3 }, { "type": "loss", "content": 0.1911073625087738, "timestamp": "2025-09-05 09:18:16.843859", "step": 3837, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:18:17.040110", "step": 3837, "epoch": 3 }, { "type": "loss", "content": 0.1686500757932663, "timestamp": "2025-09-05 09:18:17.042298", "step": 3838, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:18:17.248084", "step": 3838, "epoch": 3 }, { "type": "loss", "content": 0.15377473831176758, "timestamp": "2025-09-05 09:18:17.250236", "step": 3839, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:18:17.446870", "step": 3839, "epoch": 3 }, { "type": "loss", "content": 0.33481061458587646, "timestamp": "2025-09-05 09:18:17.461064", "step": 3840, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 09:18:22.080629", "step": 3840, "epoch": 3 }, { "type": "pplx", "content": 55.37722131907398, "timestamp": "2025-09-05 09:18:22.082611", "step": 3840, "epoch": 3 }, { "type": "info", "content": "Checkpoint saved at step 3840", "timestamp": "2025-09-05 09:18:22.537339", "step": 3840, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:18:22.697793", "step": 3840, "epoch": 3 }, { "type": "loss", "content": 0.2810446321964264, "timestamp": "2025-09-05 09:18:22.700095", "step": 3841, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:18:22.893165", "step": 3841, "epoch": 3 }, { "type": "loss", "content": 0.14373518526554108, "timestamp": "2025-09-05 09:18:22.895756", "step": 3842, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:18:23.098992", "step": 3842, "epoch": 3 }, { "type": "loss", "content": 0.3351125717163086, "timestamp": "2025-09-05 09:18:23.101228", "step": 3843, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:18:23.296487", "step": 3843, "epoch": 3 }, { "type": "loss", "content": 0.3412880599498749, "timestamp": "2025-09-05 09:18:23.310519", "step": 3844, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:18:23.496121", "step": 3844, "epoch": 3 }, { "type": "loss", "content": 0.22351853549480438, "timestamp": "2025-09-05 09:18:23.498529", "step": 3845, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:18:23.692705", "step": 3845, "epoch": 3 }, { "type": "loss", "content": 0.31305429339408875, "timestamp": "2025-09-05 09:18:23.695051", "step": 3846, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 4480027263872.0 }, "timestamp": "2025-09-05 09:18:23.860711", "step": 3846, "epoch": 3 }, { "type": "loss", "content": 0.3345796763896942, "timestamp": "2025-09-05 09:18:23.862968", "step": 3847, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:18:24.066541", "step": 3847, "epoch": 3 }, { "type": "loss", "content": 0.2449566274881363, "timestamp": "2025-09-05 09:18:24.081763", "step": 3848, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:18:24.276787", "step": 3848, "epoch": 3 }, { "type": "loss", "content": 0.23441752791404724, "timestamp": "2025-09-05 09:18:24.279105", "step": 3849, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:18:24.474893", "step": 3849, "epoch": 3 }, { "type": "loss", "content": 0.2842091917991638, "timestamp": "2025-09-05 09:18:24.477000", "step": 3850, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 4480027263872.0 }, "timestamp": "2025-09-05 09:18:24.673146", "step": 3850, "epoch": 3 }, { "type": "loss", "content": 0.4027588963508606, "timestamp": "2025-09-05 09:18:24.675318", "step": 3851, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:18:24.871283", "step": 3851, "epoch": 3 }, { "type": "loss", "content": 0.36195552349090576, "timestamp": "2025-09-05 09:18:24.885328", "step": 3852, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:18:25.070758", "step": 3852, "epoch": 3 }, { "type": "loss", "content": 0.21417009830474854, "timestamp": "2025-09-05 09:18:25.073293", "step": 3853, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:18:25.268334", "step": 3853, "epoch": 3 }, { "type": "loss", "content": 0.21039460599422455, "timestamp": "2025-09-05 09:18:25.270383", "step": 3854, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:18:25.464573", "step": 3854, "epoch": 3 }, { "type": "loss", "content": 0.2987714409828186, "timestamp": "2025-09-05 09:18:25.466698", "step": 3855, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:18:25.628851", "step": 3855, "epoch": 3 }, { "type": "loss", "content": 0.3201431930065155, "timestamp": "2025-09-05 09:18:25.645120", "step": 3856, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:18:25.839423", "step": 3856, "epoch": 3 }, { "type": "loss", "content": 0.3816319704055786, "timestamp": "2025-09-05 09:18:25.841608", "step": 3857, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:18:26.035555", "step": 3857, "epoch": 3 }, { "type": "loss", "content": 0.29688769578933716, "timestamp": "2025-09-05 09:18:26.037739", "step": 3858, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:18:26.230567", "step": 3858, "epoch": 3 }, { "type": "loss", "content": 0.3592606484889984, "timestamp": "2025-09-05 09:18:26.232683", "step": 3859, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:18:26.426548", "step": 3859, "epoch": 3 }, { "type": "loss", "content": 0.3628304898738861, "timestamp": "2025-09-05 09:18:26.440593", "step": 3860, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 09:18:31.083837", "step": 3860, "epoch": 3 }, { "type": "pplx", "content": 56.521663876148416, "timestamp": "2025-09-05 09:18:31.085627", "step": 3860, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:18:31.245688", "step": 3860, "epoch": 3 }, { "type": "loss", "content": 0.2278020679950714, "timestamp": "2025-09-05 09:18:31.248250", "step": 3861, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:18:31.414379", "step": 3861, "epoch": 3 }, { "type": "loss", "content": 0.2553630471229553, "timestamp": "2025-09-05 09:18:31.416714", "step": 3862, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:18:31.622523", "step": 3862, "epoch": 3 }, { "type": "loss", "content": 0.31360796093940735, "timestamp": "2025-09-05 09:18:31.625098", "step": 3863, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:18:31.821089", "step": 3863, "epoch": 3 }, { "type": "loss", "content": 0.23280321061611176, "timestamp": "2025-09-05 09:18:31.835268", "step": 3864, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:18:32.022303", "step": 3864, "epoch": 3 }, { "type": "loss", "content": 0.2609652876853943, "timestamp": "2025-09-05 09:18:32.024305", "step": 3865, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 4480027263872.0 }, "timestamp": "2025-09-05 09:18:32.226513", "step": 3865, "epoch": 3 }, { "type": "loss", "content": 0.44476309418678284, "timestamp": "2025-09-05 09:18:32.228929", "step": 3866, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:18:32.422074", "step": 3866, "epoch": 3 }, { "type": "loss", "content": 0.1847713738679886, "timestamp": "2025-09-05 09:18:32.424353", "step": 3867, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:18:32.618727", "step": 3867, "epoch": 3 }, { "type": "loss", "content": 0.31068137288093567, "timestamp": "2025-09-05 09:18:32.633213", "step": 3868, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:18:32.820267", "step": 3868, "epoch": 3 }, { "type": "loss", "content": 0.2597085237503052, "timestamp": "2025-09-05 09:18:32.822691", "step": 3869, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:18:33.026080", "step": 3869, "epoch": 3 }, { "type": "loss", "content": 0.27988213300704956, "timestamp": "2025-09-05 09:18:33.028025", "step": 3870, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:18:33.222332", "step": 3870, "epoch": 3 }, { "type": "loss", "content": 0.28773733973503113, "timestamp": "2025-09-05 09:18:33.224225", "step": 3871, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:18:33.418058", "step": 3871, "epoch": 3 }, { "type": "loss", "content": 0.2838188707828522, "timestamp": "2025-09-05 09:18:33.432683", "step": 3872, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:18:33.620117", "step": 3872, "epoch": 3 }, { "type": "loss", "content": 0.31004562973976135, "timestamp": "2025-09-05 09:18:33.622110", "step": 3873, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:18:33.815513", "step": 3873, "epoch": 3 }, { "type": "loss", "content": 0.19978618621826172, "timestamp": "2025-09-05 09:18:33.817791", "step": 3874, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:18:34.021912", "step": 3874, "epoch": 3 }, { "type": "loss", "content": 0.18819376826286316, "timestamp": "2025-09-05 09:18:34.024432", "step": 3875, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:18:34.219846", "step": 3875, "epoch": 3 }, { "type": "loss", "content": 0.2756434381008148, "timestamp": "2025-09-05 09:18:34.234092", "step": 3876, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 4800029206464.0 }, "timestamp": "2025-09-05 09:18:34.423758", "step": 3876, "epoch": 3 }, { "type": "loss", "content": 0.22656309604644775, "timestamp": "2025-09-05 09:18:34.426614", "step": 3877, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 4480027263872.0 }, "timestamp": "2025-09-05 09:18:34.632024", "step": 3877, "epoch": 3 }, { "type": "loss", "content": 0.3541288375854492, "timestamp": "2025-09-05 09:18:34.633953", "step": 3878, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:18:34.798791", "step": 3878, "epoch": 3 }, { "type": "loss", "content": 0.1255226582288742, "timestamp": "2025-09-05 09:18:34.800876", "step": 3879, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:18:34.964815", "step": 3879, "epoch": 3 }, { "type": "loss", "content": 0.345575213432312, "timestamp": "2025-09-05 09:18:34.981003", "step": 3880, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 09:18:39.638308", "step": 3880, "epoch": 3 }, { "type": "pplx", "content": 57.5111446457821, "timestamp": "2025-09-05 09:18:39.640566", "step": 3880, "epoch": 3 }, { "type": "info", "content": "Checkpoint saved at step 3880", "timestamp": "2025-09-05 09:18:40.303411", "step": 3880, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:18:40.494156", "step": 3880, "epoch": 3 }, { "type": "loss", "content": 0.29227620363235474, "timestamp": "2025-09-05 09:18:40.496215", "step": 3881, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:18:40.708619", "step": 3881, "epoch": 3 }, { "type": "loss", "content": 0.30040618777275085, "timestamp": "2025-09-05 09:18:40.710697", "step": 3882, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:18:40.906165", "step": 3882, "epoch": 3 }, { "type": "loss", "content": 0.2703894376754761, "timestamp": "2025-09-05 09:18:40.908114", "step": 3883, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:18:41.104515", "step": 3883, "epoch": 3 }, { "type": "loss", "content": 0.344621866941452, "timestamp": "2025-09-05 09:18:41.120488", "step": 3884, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:18:41.315270", "step": 3884, "epoch": 3 }, { "type": "loss", "content": 0.17300912737846375, "timestamp": "2025-09-05 09:18:41.318283", "step": 3885, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 4480027263872.0 }, "timestamp": "2025-09-05 09:18:41.525061", "step": 3885, "epoch": 3 }, { "type": "loss", "content": 0.4111814796924591, "timestamp": "2025-09-05 09:18:41.527036", "step": 3886, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:18:41.724233", "step": 3886, "epoch": 3 }, { "type": "loss", "content": 0.32882702350616455, "timestamp": "2025-09-05 09:18:41.726372", "step": 3887, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 4800029206464.0 }, "timestamp": "2025-09-05 09:18:41.932520", "step": 3887, "epoch": 3 }, { "type": "loss", "content": 0.361674040555954, "timestamp": "2025-09-05 09:18:41.941725", "step": 3888, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:18:42.104101", "step": 3888, "epoch": 3 }, { "type": "loss", "content": 0.1684613674879074, "timestamp": "2025-09-05 09:18:42.105741", "step": 3889, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:18:42.309251", "step": 3889, "epoch": 3 }, { "type": "loss", "content": 0.20353275537490845, "timestamp": "2025-09-05 09:18:42.311380", "step": 3890, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:18:42.514218", "step": 3890, "epoch": 3 }, { "type": "loss", "content": 0.22002647817134857, "timestamp": "2025-09-05 09:18:42.516313", "step": 3891, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:18:42.711114", "step": 3891, "epoch": 3 }, { "type": "loss", "content": 0.3730429708957672, "timestamp": "2025-09-05 09:18:42.724982", "step": 3892, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-05 09:18:42.912985", "step": 3892, "epoch": 3 }, { "type": "loss", "content": 0.30563536286354065, "timestamp": "2025-09-05 09:18:42.915080", "step": 3893, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:18:43.110081", "step": 3893, "epoch": 3 }, { "type": "loss", "content": 0.3045671880245209, "timestamp": "2025-09-05 09:18:43.112307", "step": 3894, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 3840023378688.0 }, "timestamp": "2025-09-05 09:18:43.329512", "step": 3894, "epoch": 3 }, { "type": "loss", "content": 0.2163662165403366, "timestamp": "2025-09-05 09:18:43.331571", "step": 3895, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 4160025321280.0 }, "timestamp": "2025-09-05 09:18:43.529117", "step": 3895, "epoch": 3 }, { "type": "loss", "content": 0.2787623703479767, "timestamp": "2025-09-05 09:18:43.537962", "step": 3896, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:18:43.700965", "step": 3896, "epoch": 3 }, { "type": "loss", "content": 0.3941419720649719, "timestamp": "2025-09-05 09:18:43.703207", "step": 3897, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:18:43.867354", "step": 3897, "epoch": 3 }, { "type": "loss", "content": 0.22157038748264313, "timestamp": "2025-09-05 09:18:43.869374", "step": 3898, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-05 09:18:44.035260", "step": 3898, "epoch": 3 }, { "type": "loss", "content": 0.21150384843349457, "timestamp": "2025-09-05 09:18:44.037093", "step": 3899, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-05 09:18:44.202709", "step": 3899, "epoch": 3 }, { "type": "loss", "content": 0.14150860905647278, "timestamp": "2025-09-05 09:18:44.211577", "step": 3900, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 09:18:48.840423", "step": 3900, "epoch": 3 }, { "type": "pplx", "content": 57.68526548554968, "timestamp": "2025-09-05 09:18:48.842740", "step": 3900, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4151977605760 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3832594718208 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3193828943104 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2555063168000 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2874446055552 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3513211830656 }, { "type": "perplexity", "in_batch_dim": [ 2, 160 ], "batch_size": 8, "flops": 3193828943104 } ], "timestamp": "2025-09-05 09:18:53.413113", "step": 3900, "epoch": 3 }, { "type": "pplx", "content": 57.68526548554968, "timestamp": "2025-09-05 09:18:53.415162", "step": 3900, "epoch": 3 }, { "type": "best_pplx", "content": 52.40073254912315, "timestamp": "2025-09-05 09:18:53.416944", "step": 3900, "epoch": 3 }, { "type": "best_step", "content": 3040, "timestamp": "2025-09-05 09:18:53.418308", "step": 3900, "epoch": 3 }, { "type": "total_pplx_flops", "content": 49705559881469696, "timestamp": "2025-09-05 09:18:53.420522", "step": 3900, "epoch": 3 }, { "type": "total_train_flops", "content": 1.368968336766336e+16, "timestamp": "2025-09-05 09:18:53.781419", "step": 3900, "epoch": 3 } ], "best_evals": { "pplx": { "score": 52.40073254912315, "step": 3040 }, "rouge1": { "precision": 0.30901464044392746, "recall": 0.2632675026888113, "fmeasure": 0.2558464239565212 } } }